-rw-r--r--  MAINTAINERS                                     12
-rw-r--r--  drivers/mtd/Kconfig                              2
-rw-r--r--  drivers/mtd/Makefile                             2
-rw-r--r--  drivers/mtd/ubi/Kconfig                         58
-rw-r--r--  drivers/mtd/ubi/Kconfig.debug                  104
-rw-r--r--  drivers/mtd/ubi/Makefile                         7
-rw-r--r--  drivers/mtd/ubi/build.c                        848
-rw-r--r--  drivers/mtd/ubi/cdev.c                         722
-rw-r--r--  drivers/mtd/ubi/debug.c                        224
-rw-r--r--  drivers/mtd/ubi/debug.h                        161
-rw-r--r--  drivers/mtd/ubi/eba.c                         1241
-rw-r--r--  drivers/mtd/ubi/gluebi.c                       323
-rw-r--r--  drivers/mtd/ubi/io.c                          1259
-rw-r--r--  drivers/mtd/ubi/kapi.c                         575
-rw-r--r--  drivers/mtd/ubi/misc.c                         105
-rw-r--r--  drivers/mtd/ubi/scan.c                        1368
-rw-r--r--  drivers/mtd/ubi/scan.h                         167
-rw-r--r--  drivers/mtd/ubi/ubi.h                          535
-rw-r--r--  drivers/mtd/ubi/upd.c                          348
-rw-r--r--  drivers/mtd/ubi/vmt.c                          809
-rw-r--r--  drivers/mtd/ubi/vtbl.c                         809
-rw-r--r--  drivers/mtd/ubi/wl.c                          1671
-rw-r--r--  fs/jffs2/fs.c                                   12
-rw-r--r--  fs/jffs2/os-linux.h                              6
-rw-r--r--  fs/jffs2/wbuf.c                                 24
-rw-r--r--  fs/ocfs2/alloc.c                              3037
-rw-r--r--  fs/ocfs2/alloc.h                                27
-rw-r--r--  fs/ocfs2/aops.c                               1011
-rw-r--r--  fs/ocfs2/aops.h                                 77
-rw-r--r--  fs/ocfs2/cluster/quorum.c                        5
-rw-r--r--  fs/ocfs2/cluster/tcp_internal.h                  5
-rw-r--r--  fs/ocfs2/dir.c                                  15
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c                         5
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c                       2
-rw-r--r--  fs/ocfs2/dlmglue.c                             143
-rw-r--r--  fs/ocfs2/dlmglue.h                               3
-rw-r--r--  fs/ocfs2/extent_map.c                         1233
-rw-r--r--  fs/ocfs2/extent_map.h                           39
-rw-r--r--  fs/ocfs2/file.c                                637
-rw-r--r--  fs/ocfs2/file.h                                  5
-rw-r--r--  fs/ocfs2/inode.c                               199
-rw-r--r--  fs/ocfs2/inode.h                                23
-rw-r--r--  fs/ocfs2/journal.c                              24
-rw-r--r--  fs/ocfs2/journal.h                               2
-rw-r--r--  fs/ocfs2/mmap.c                                  7
-rw-r--r--  fs/ocfs2/namei.c                                23
-rw-r--r--  fs/ocfs2/ocfs2.h                                55
-rw-r--r--  fs/ocfs2/ocfs2_fs.h                             31
-rw-r--r--  fs/ocfs2/ocfs2_lockid.h                          5
-rw-r--r--  fs/ocfs2/slot_map.c                              2
-rw-r--r--  fs/ocfs2/suballoc.c                              3
-rw-r--r--  fs/ocfs2/super.c                                 7
-rw-r--r--  fs/ocfs2/vote.c                                289
-rw-r--r--  fs/ocfs2/vote.h                                  3
-rw-r--r--  fs/sync.c                                        8
-rw-r--r--  include/linux/fs.h                               9
-rw-r--r--  include/linux/mtd/ubi.h                        202
-rw-r--r--  include/linux/sched.h                            4
-rw-r--r--  include/linux/seqlock.h                          8
-rw-r--r--  include/mtd/Kbuild                               2
-rw-r--r--  include/mtd/mtd-abi.h                            1
-rw-r--r--  include/mtd/ubi-header.h                       360
-rw-r--r--  include/mtd/ubi-user.h                         161
-rw-r--r--  kernel/sched.c                                   2
-rw-r--r--  net/ipv4/cipso_ipv4.c                           41
-rw-r--r--  net/netlabel/netlabel_kapi.c                     3
-rw-r--r--  security/selinux/Makefile                        2
-rw-r--r--  security/selinux/avc.c                           2
-rw-r--r--  security/selinux/hooks.c                        38
-rw-r--r--  security/selinux/include/av_perm_to_string.h   102
-rw-r--r--  security/selinux/include/av_permissions.h      179
-rw-r--r--  security/selinux/include/class_to_string.h      34
-rw-r--r--  security/selinux/include/flask.h                16
-rw-r--r--  security/selinux/include/netlabel.h (renamed from security/selinux/include/selinux_netlabel.h)  71
-rw-r--r--  security/selinux/include/security.h             29
-rw-r--r--  security/selinux/netlabel.c                    363
-rw-r--r--  security/selinux/selinuxfs.c                    85
-rw-r--r--  security/selinux/ss/services.c                 499
78 files changed, 17513 insertions(+), 3017 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index f56c7e172cee..5519d257b556 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2246,6 +2246,14 @@ L: linux-mtd@lists.infradead.org
 T:	git git://git.infradead.org/mtd-2.6.git
 S:	Maintained
 
+UNSORTED BLOCK IMAGES (UBI)
+P:	Artem Bityutskiy
+M:	dedekind@infradead.org
+W:	http://www.linux-mtd.infradead.org/
+L:	linux-mtd@lists.infradead.org
+T:	git git://git.infradead.org/ubi-2.6.git
+S:	Maintained
+
 MICROTEK X6 SCANNER
 P:	Oliver Neukum
 M:	oliver@neukum.name
@@ -2972,8 +2980,10 @@ P: Stephen Smalley
 M:	sds@tycho.nsa.gov
 P:	James Morris
 M:	jmorris@namei.org
+P:	Eric Paris
+M:	eparis@parisplace.org
 L:	linux-kernel@vger.kernel.org (kernel issues)
-L:	selinux@tycho.nsa.gov (general discussion)
+L:	selinux@tycho.nsa.gov (subscribers-only, general discussion)
 W:	http://www.nsa.gov/selinux
 S:	Supported
 
diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig
index 26f75c299440..6d1b91bf7ad5 100644
--- a/drivers/mtd/Kconfig
+++ b/drivers/mtd/Kconfig
@@ -292,5 +292,7 @@ source "drivers/mtd/nand/Kconfig"
 
 source "drivers/mtd/onenand/Kconfig"
 
+source "drivers/mtd/ubi/Kconfig"
+
 endmenu
 
diff --git a/drivers/mtd/Makefile b/drivers/mtd/Makefile
index c130e6261adf..92055405cb30 100644
--- a/drivers/mtd/Makefile
+++ b/drivers/mtd/Makefile
@@ -28,3 +28,5 @@ nftl-objs := nftlcore.o nftlmount.o
 inftl-objs	:= inftlcore.o inftlmount.o
 
 obj-y		+= chips/ maps/ devices/ nand/ onenand/
+
+obj-$(CONFIG_MTD_UBI)	+= ubi/
diff --git a/drivers/mtd/ubi/Kconfig b/drivers/mtd/ubi/Kconfig
new file mode 100644
index 000000000000..b9daf159a4a7
--- /dev/null
+++ b/drivers/mtd/ubi/Kconfig
@@ -0,0 +1,58 @@
1# drivers/mtd/ubi/Kconfig
2
3menu "UBI - Unsorted block images"
4 depends on MTD
5
6config MTD_UBI
7 tristate "Enable UBI"
8 depends on MTD
9 select CRC32
10 help
11 UBI is a software layer above the MTD layer which provides LVM-like
12 logical volumes on top of MTD devices, hides some complexities of
13 flash chips like wear and bad blocks, and provides some other useful
14 capabilities. Please consult the MTD web site for more details
15 (www.linux-mtd.infradead.org).
16
17config MTD_UBI_WL_THRESHOLD
18 int "UBI wear-leveling threshold"
19 default 4096
20 range 2 65536
21 depends on MTD_UBI
22 help
23 This parameter defines the maximum difference between the highest
24 and the lowest erase counter values of the eraseblocks of a UBI
25 device. When this threshold is exceeded, UBI starts performing wear
26 leveling by moving data from eraseblocks with low erase counters to
27 eraseblocks with high erase counters. Leave the default value if
28 unsure.
29
30config MTD_UBI_BEB_RESERVE
31 int "Percentage of reserved eraseblocks for bad eraseblocks handling"
32 default 1
33 range 0 25
34 depends on MTD_UBI
35 help
36 If the MTD device may have bad eraseblocks (e.g. NAND flash), UBI
37 reserves some physical eraseblocks to handle new bad eraseblocks.
38 For example, if a flash physical eraseblock becomes bad, UBI uses one
39 of these reserved physical eraseblocks to relocate its contents. This
40 option specifies how many physical eraseblocks will be reserved for
41 bad eraseblock handling (as a percentage of the total number of good
42 flash eraseblocks). If the underlying flash cannot have bad
43 eraseblocks (e.g. NOR flash), this value is ignored and nothing is
44 reserved. Leave the default value if unsure.
45
46config MTD_UBI_GLUEBI
47 bool "Emulate MTD devices"
48 default n
49 depends on MTD_UBI
50 help
51 This option enables MTD device emulation on top of UBI volumes: for
52 each UBI volume an MTD device is created, and all I/O to this MTD
53 device is redirected to the UBI volume. This is handy for making
54 MTD-oriented software (like JFFS2) work on top of UBI. Do not enable
55 this if no legacy software will be used.
56
57source "drivers/mtd/ubi/Kconfig.debug"
58endmenu
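
Note: to make the MTD_UBI_BEB_RESERVE semantics above concrete, the option is
a percentage of the good eraseblocks, not an absolute count. A minimal
user-space sketch of the calculation (the function name and the round-up
rounding are illustrative assumptions, not code from this patch):

	#include <stdio.h>

	/* Illustrative: PEBs reserved for bad-block handling, taken as a
	 * percentage of the good PEBs and rounded up so that a non-zero
	 * percentage always reserves at least one PEB. */
	static int beb_reserved_pebs(int good_peb_count, int reserve_pct)
	{
		return (good_peb_count * reserve_pct + 99) / 100;
	}

	int main(void)
	{
		/* a 2048-block NAND with the default 1% reserve -> 21 PEBs */
		printf("%d\n", beb_reserved_pebs(2048, 1));
		return 0;
	}
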
diff --git a/drivers/mtd/ubi/Kconfig.debug b/drivers/mtd/ubi/Kconfig.debug
new file mode 100644
index 000000000000..1e2ee22edeff
--- /dev/null
+++ b/drivers/mtd/ubi/Kconfig.debug
@@ -0,0 +1,104 @@
1comment "UBI debugging options"
2 depends on MTD_UBI
3
4config MTD_UBI_DEBUG
5 bool "UBI debugging"
6 depends on SYSFS
7 depends on MTD_UBI
8 select DEBUG_FS
9 select KALLSYMS_ALL
10 help
11 This option enables UBI debugging.
12
13config MTD_UBI_DEBUG_MSG
14 bool "UBI debugging messages"
15 depends on MTD_UBI_DEBUG
16 default n
17 help
18 This option enables UBI debugging messages.
19
20config MTD_UBI_DEBUG_PARANOID
21 bool "Extra self-checks"
22 default n
23 depends on MTD_UBI_DEBUG
24 help
25 This option enables extra checks in the UBI code. Note that this
26 slows UBI down significantly.
27
28config MTD_UBI_DEBUG_DISABLE_BGT
29 bool "Do not enable the UBI background thread"
30 depends on MTD_UBI_DEBUG
31 default n
32 help
33 This option switches the background thread off by default. The thread
34 may also be enabled/disabled via UBI sysfs.
35
36config MTD_UBI_DEBUG_USERSPACE_IO
37 bool "Direct user-space write/erase support"
38 default n
39 depends on MTD_UBI_DEBUG
40 help
41 By default, users cannot directly write and erase individual
42 eraseblocks of dynamic volumes, and have to use the volume update
43 operation instead. This option enables this capability - it is very
44 useful for debugging and testing.
45
46config MTD_UBI_DEBUG_EMULATE_BITFLIPS
47 bool "Emulate flash bit-flips"
48 depends on MTD_UBI_DEBUG
49 default n
50 help
51 This option emulates bit-flips with probability 1/50, which in turn
52 causes scrubbing. Useful for debugging and stressing UBI.
53
54config MTD_UBI_DEBUG_EMULATE_WRITE_FAILURES
55 bool "Emulate flash write failures"
56 depends on MTD_UBI_DEBUG
57 default n
58 help
59 This option emulates write failures with probability 1/100. Useful for
60 debugging and testing how UBI handles errors.
61
62config MTD_UBI_DEBUG_EMULATE_ERASE_FAILURES
63 bool "Emulate flash erase failures"
64 depends on MTD_UBI_DEBUG
65 default n
66 help
67 This option emulates erase failures with probability 1/100. Useful for
68 debugging and testing how UBI handles errors.
69
70menu "Additional UBI debugging messages"
71 depends on MTD_UBI_DEBUG
72
73config MTD_UBI_DEBUG_MSG_BLD
74 bool "Additional UBI initialization and build messages"
75 default n
76 depends on MTD_UBI_DEBUG
77 help
78 This option enables detailed UBI initialization and device build
79 debugging messages.
80
81config MTD_UBI_DEBUG_MSG_EBA
82 bool "Eraseblock association unit messages"
83 default n
84 depends on MTD_UBI_DEBUG
85 help
86 This option enables debugging messages from the UBI eraseblock
87 association unit.
88
89config MTD_UBI_DEBUG_MSG_WL
90 bool "Wear-leveling unit messages"
91 default n
92 depends on MTD_UBI_DEBUG
93 help
94 This option enables debugging messages from the UBI wear-leveling
95 unit.
96
97config MTD_UBI_DEBUG_MSG_IO
98 bool "Input/output unit messages"
99 default n
100 depends on MTD_UBI_DEBUG
101 help
102 This option enables debugging messages from the UBI input/output unit.
103
104endmenu # UBI debugging messages
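
Note: the 1/50 and 1/100 odds quoted above are typically implemented as a
cheap PRNG check on every I/O operation. A user-space sketch of the idea (the
helper name is hypothetical, not one of this patch's functions):

	#include <stdlib.h>

	/* Return non-zero roughly once per 'one_in' calls - the shape of
	 * the fault injection behind the emulation options above. */
	static int maybe_inject_fault(int one_in)
	{
		return (rand() % one_in) == 0;
	}

	/* E.g. a read path could do: if (maybe_inject_fault(50)) flip one
	 * bit in the buffer and report a correctable error, so that the
	 * wear-leveling unit schedules scrubbing of that eraseblock. */
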
diff --git a/drivers/mtd/ubi/Makefile b/drivers/mtd/ubi/Makefile
new file mode 100644
index 000000000000..dd834e04151b
--- /dev/null
+++ b/drivers/mtd/ubi/Makefile
@@ -0,0 +1,7 @@
1obj-$(CONFIG_MTD_UBI) += ubi.o
2
3ubi-y += vtbl.o vmt.o upd.o build.o cdev.o kapi.o eba.o io.o wl.o scan.o
4ubi-y += misc.o
5
6ubi-$(CONFIG_MTD_UBI_DEBUG) += debug.o
7ubi-$(CONFIG_MTD_UBI_GLUEBI) += gluebi.o
diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c
new file mode 100644
index 000000000000..555d594d1811
--- /dev/null
+++ b/drivers/mtd/ubi/build.c
@@ -0,0 +1,848 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 * Copyright (c) Nokia Corporation, 2007
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * Author: Artem Bityutskiy (Битюцкий Артём),
20 * Frank Haverkamp
21 */
22
23/*
24 * This file includes UBI initialization and building of UBI devices. At the
25 * moment UBI devices may only be added while UBI is being initialized, but
26 * dynamic device add/remove functionality is planned. Also, at the moment we
27 * only attach UBI devices by scanning, which will become a bottleneck when
28 * flashes reach a certain large size. Then one may improve UBI and add other methods.
29 */
30
31#include <linux/err.h>
32#include <linux/module.h>
33#include <linux/moduleparam.h>
34#include <linux/stringify.h>
35#include <linux/stat.h>
36#include "ubi.h"
37
38/* Maximum length of the 'mtd=' parameter */
39#define MTD_PARAM_LEN_MAX 64
40
41/**
42 * struct mtd_dev_param - MTD device parameter description data structure.
43 * @name: MTD device name or number string
44 * @vid_hdr_offs: VID header offset
45 * @data_offs: data offset
46 */
47struct mtd_dev_param
48{
49 char name[MTD_PARAM_LEN_MAX];
50 int vid_hdr_offs;
51 int data_offs;
52};
53
54/* Numbers of elements set in the @mtd_dev_param array */
55static int mtd_devs = 0;
56
57/* MTD devices specification parameters */
58static struct mtd_dev_param mtd_dev_param[UBI_MAX_DEVICES];
59
60/* Number of UBI devices in system */
61int ubi_devices_cnt;
62
63/* All UBI devices in system */
64struct ubi_device *ubi_devices[UBI_MAX_DEVICES];
65
66/* Root UBI "class" object (corresponds to '/<sysfs>/class/ubi/') */
67struct class *ubi_class;
68
69/* "Show" method for files in '/<sysfs>/class/ubi/' */
70static ssize_t ubi_version_show(struct class *class, char *buf)
71{
72 return sprintf(buf, "%d\n", UBI_VERSION);
73}
74
75/* UBI version attribute ('/<sysfs>/class/ubi/version') */
76static struct class_attribute ubi_version =
77 __ATTR(version, S_IRUGO, ubi_version_show, NULL);
78
79static ssize_t dev_attribute_show(struct device *dev,
80 struct device_attribute *attr, char *buf);
81
82/* UBI device attributes (correspond to files in '/<sysfs>/class/ubi/ubiX') */
83static struct device_attribute dev_eraseblock_size =
84 __ATTR(eraseblock_size, S_IRUGO, dev_attribute_show, NULL);
85static struct device_attribute dev_avail_eraseblocks =
86 __ATTR(avail_eraseblocks, S_IRUGO, dev_attribute_show, NULL);
87static struct device_attribute dev_total_eraseblocks =
88 __ATTR(total_eraseblocks, S_IRUGO, dev_attribute_show, NULL);
89static struct device_attribute dev_volumes_count =
90 __ATTR(volumes_count, S_IRUGO, dev_attribute_show, NULL);
91static struct device_attribute dev_max_ec =
92 __ATTR(max_ec, S_IRUGO, dev_attribute_show, NULL);
93static struct device_attribute dev_reserved_for_bad =
94 __ATTR(reserved_for_bad, S_IRUGO, dev_attribute_show, NULL);
95static struct device_attribute dev_bad_peb_count =
96 __ATTR(bad_peb_count, S_IRUGO, dev_attribute_show, NULL);
97static struct device_attribute dev_max_vol_count =
98 __ATTR(max_vol_count, S_IRUGO, dev_attribute_show, NULL);
99static struct device_attribute dev_min_io_size =
100 __ATTR(min_io_size, S_IRUGO, dev_attribute_show, NULL);
101static struct device_attribute dev_bgt_enabled =
102 __ATTR(bgt_enabled, S_IRUGO, dev_attribute_show, NULL);
103
104/* "Show" method for files in '/<sysfs>/class/ubi/ubiX/' */
105static ssize_t dev_attribute_show(struct device *dev,
106 struct device_attribute *attr, char *buf)
107{
108 const struct ubi_device *ubi;
109
110 ubi = container_of(dev, struct ubi_device, dev);
111 if (attr == &dev_eraseblock_size)
112 return sprintf(buf, "%d\n", ubi->leb_size);
113 else if (attr == &dev_avail_eraseblocks)
114 return sprintf(buf, "%d\n", ubi->avail_pebs);
115 else if (attr == &dev_total_eraseblocks)
116 return sprintf(buf, "%d\n", ubi->good_peb_count);
117 else if (attr == &dev_volumes_count)
118 return sprintf(buf, "%d\n", ubi->vol_count);
119 else if (attr == &dev_max_ec)
120 return sprintf(buf, "%d\n", ubi->max_ec);
121 else if (attr == &dev_reserved_for_bad)
122 return sprintf(buf, "%d\n", ubi->beb_rsvd_pebs);
123 else if (attr == &dev_bad_peb_count)
124 return sprintf(buf, "%d\n", ubi->bad_peb_count);
125 else if (attr == &dev_max_vol_count)
126 return sprintf(buf, "%d\n", ubi->vtbl_slots);
127 else if (attr == &dev_min_io_size)
128 return sprintf(buf, "%d\n", ubi->min_io_size);
129 else if (attr == &dev_bgt_enabled)
130 return sprintf(buf, "%d\n", ubi->thread_enabled);
131 else
132 BUG();
133
134 return 0;
135}
136
137/* Fake "release" method for UBI devices */
138static void dev_release(struct device *dev) { }
139
140/**
141 * ubi_sysfs_init - initialize sysfs for an UBI device.
142 * @ubi: UBI device description object
143 *
144 * This function returns zero in case of success and a negative error code in
145 * case of failure.
146 */
147static int ubi_sysfs_init(struct ubi_device *ubi)
148{
149 int err;
150
151 ubi->dev.release = dev_release;
152 ubi->dev.devt = MKDEV(ubi->major, 0);
153 ubi->dev.class = ubi_class;
154 sprintf(&ubi->dev.bus_id[0], UBI_NAME_STR"%d", ubi->ubi_num);
155 err = device_register(&ubi->dev);
156 if (err)
157 goto out;
158
159 err = device_create_file(&ubi->dev, &dev_eraseblock_size);
160 if (err)
161 goto out_unregister;
162 err = device_create_file(&ubi->dev, &dev_avail_eraseblocks);
163 if (err)
164 goto out_eraseblock_size;
165 err = device_create_file(&ubi->dev, &dev_total_eraseblocks);
166 if (err)
167 goto out_avail_eraseblocks;
168 err = device_create_file(&ubi->dev, &dev_volumes_count);
169 if (err)
170 goto out_total_eraseblocks;
171 err = device_create_file(&ubi->dev, &dev_max_ec);
172 if (err)
173 goto out_volumes_count;
174 err = device_create_file(&ubi->dev, &dev_reserved_for_bad);
175 if (err)
176 goto out_volumes_max_ec;
177 err = device_create_file(&ubi->dev, &dev_bad_peb_count);
178 if (err)
179 goto out_reserved_for_bad;
180 err = device_create_file(&ubi->dev, &dev_max_vol_count);
181 if (err)
182 goto out_bad_peb_count;
183 err = device_create_file(&ubi->dev, &dev_min_io_size);
184 if (err)
185 goto out_max_vol_count;
186 err = device_create_file(&ubi->dev, &dev_bgt_enabled);
187 if (err)
188 goto out_min_io_size;
189
190 return 0;
191
192out_min_io_size:
193 device_remove_file(&ubi->dev, &dev_min_io_size);
194out_max_vol_count:
195 device_remove_file(&ubi->dev, &dev_max_vol_count);
196out_bad_peb_count:
197 device_remove_file(&ubi->dev, &dev_bad_peb_count);
198out_reserved_for_bad:
199 device_remove_file(&ubi->dev, &dev_reserved_for_bad);
200out_volumes_max_ec:
201 device_remove_file(&ubi->dev, &dev_max_ec);
202out_volumes_count:
203 device_remove_file(&ubi->dev, &dev_volumes_count);
204out_total_eraseblocks:
205 device_remove_file(&ubi->dev, &dev_total_eraseblocks);
206out_avail_eraseblocks:
207 device_remove_file(&ubi->dev, &dev_avail_eraseblocks);
208out_eraseblock_size:
209 device_remove_file(&ubi->dev, &dev_eraseblock_size);
210out_unregister:
211 device_unregister(&ubi->dev);
212out:
213 ubi_err("failed to initialize sysfs for %s", ubi->ubi_name);
214 return err;
215}
216
217/**
218 * ubi_sysfs_close - close sysfs for an UBI device.
219 * @ubi: UBI device description object
220 */
221static void ubi_sysfs_close(struct ubi_device *ubi)
222{
223 device_remove_file(&ubi->dev, &dev_bgt_enabled);
224 device_remove_file(&ubi->dev, &dev_min_io_size);
225 device_remove_file(&ubi->dev, &dev_max_vol_count);
226 device_remove_file(&ubi->dev, &dev_bad_peb_count);
227 device_remove_file(&ubi->dev, &dev_reserved_for_bad);
228 device_remove_file(&ubi->dev, &dev_max_ec);
229 device_remove_file(&ubi->dev, &dev_volumes_count);
230 device_remove_file(&ubi->dev, &dev_total_eraseblocks);
231 device_remove_file(&ubi->dev, &dev_avail_eraseblocks);
232 device_remove_file(&ubi->dev, &dev_eraseblock_size);
233 device_unregister(&ubi->dev);
234}
235
236/**
237 * kill_volumes - destroy all volumes.
238 * @ubi: UBI device description object
239 */
240static void kill_volumes(struct ubi_device *ubi)
241{
242 int i;
243
244 for (i = 0; i < ubi->vtbl_slots; i++)
245 if (ubi->volumes[i])
246 ubi_free_volume(ubi, i);
247}
248
249/**
250 * uif_init - initialize user interfaces for an UBI device.
251 * @ubi: UBI device description object
252 *
253 * This function returns zero in case of success and a negative error code in
254 * case of failure.
255 */
256static int uif_init(struct ubi_device *ubi)
257{
258 int i, err;
259 dev_t dev;
260
261 mutex_init(&ubi->vtbl_mutex);
262 spin_lock_init(&ubi->volumes_lock);
263
264 sprintf(ubi->ubi_name, UBI_NAME_STR "%d", ubi->ubi_num);
265
266 /*
267 * Major numbers for the UBI character devices are allocated
268 * dynamically. Major numbers of volume character devices are
269 * equal to those of the corresponding UBI character device. Minor
270 * numbers of UBI character devices are 0, while minor numbers of
271 * volume character devices start from 1. Thus, we allocate one major
272 * number and ubi->vtbl_slots + 1 minor numbers.
273 */
274 err = alloc_chrdev_region(&dev, 0, ubi->vtbl_slots + 1, ubi->ubi_name);
275 if (err) {
276 ubi_err("cannot register UBI character devices");
277 return err;
278 }
279
280 cdev_init(&ubi->cdev, &ubi_cdev_operations);
281 ubi->major = MAJOR(dev);
282 dbg_msg("%s major is %u", ubi->ubi_name, ubi->major);
283 ubi->cdev.owner = THIS_MODULE;
284
285 dev = MKDEV(ubi->major, 0);
286 err = cdev_add(&ubi->cdev, dev, 1);
287 if (err) {
288 ubi_err("cannot add character device %s", ubi->ubi_name);
289 goto out_unreg;
290 }
291
292 err = ubi_sysfs_init(ubi);
293 if (err)
294 goto out_cdev;
295
296 for (i = 0; i < ubi->vtbl_slots; i++)
297 if (ubi->volumes[i]) {
298 err = ubi_add_volume(ubi, i);
299 if (err)
300 goto out_volumes;
301 }
302
303 return 0;
304
305out_volumes:
306 kill_volumes(ubi);
307 ubi_sysfs_close(ubi);
308out_cdev:
309 cdev_del(&ubi->cdev);
310out_unreg:
311 unregister_chrdev_region(MKDEV(ubi->major, 0),
312 ubi->vtbl_slots + 1);
313 return err;
314}
315
316/**
317 * uif_close - close user interfaces for an UBI device.
318 * @ubi: UBI device description object
319 */
320static void uif_close(struct ubi_device *ubi)
321{
322 kill_volumes(ubi);
323 ubi_sysfs_close(ubi);
324 cdev_del(&ubi->cdev);
325 unregister_chrdev_region(MKDEV(ubi->major, 0), ubi->vtbl_slots + 1);
326}
327
328/**
329 * attach_by_scanning - attach an MTD device using scanning method.
330 * @ubi: UBI device descriptor
331 *
332 * This function returns zero in case of success and a negative error code in
333 * case of failure.
334 *
335 * Note, currently this is the only method to attach UBI devices. Hopefully in
336 * the future we'll have more scalable attaching methods and avoid full media
337 * scanning. But even in this case scanning will be needed as a fall-back
338 * attaching method if there are some on-flash table corruptions.
339 */
340static int attach_by_scanning(struct ubi_device *ubi)
341{
342 int err;
343 struct ubi_scan_info *si;
344
345 si = ubi_scan(ubi);
346 if (IS_ERR(si))
347 return PTR_ERR(si);
348
349 ubi->bad_peb_count = si->bad_peb_count;
350 ubi->good_peb_count = ubi->peb_count - ubi->bad_peb_count;
351 ubi->max_ec = si->max_ec;
352 ubi->mean_ec = si->mean_ec;
353
354 err = ubi_read_volume_table(ubi, si);
355 if (err)
356 goto out_si;
357
358 err = ubi_wl_init_scan(ubi, si);
359 if (err)
360 goto out_vtbl;
361
362 err = ubi_eba_init_scan(ubi, si);
363 if (err)
364 goto out_wl;
365
366 ubi_scan_destroy_si(si);
367 return 0;
368
369out_wl:
370 ubi_wl_close(ubi);
371out_vtbl:
372 kfree(ubi->vtbl);
373out_si:
374 ubi_scan_destroy_si(si);
375 return err;
376}
377
378/**
379 * io_init - initialize I/O unit for a given UBI device.
380 * @ubi: UBI device description object
381 *
382 * If @ubi->vid_hdr_offset or @ubi->leb_start is zero, default offsets are
383 * assumed:
384 * o EC header is always at offset zero - this cannot be changed;
385 * o VID header starts just after the EC header at the closest address
386 * aligned to @ubi->hdrs_min_io_size;
387 * o data starts just after the VID header at the closest address aligned to
388 * @ubi->min_io_size.
389 *
390 * This function returns zero in case of success and a negative error code in
391 * case of failure.
392 */
393static int io_init(struct ubi_device *ubi)
394{
395 if (ubi->mtd->numeraseregions != 0) {
396 /*
397 * Some flashes have several erase regions. Different regions
398 * may have different eraseblock size and other
399 * characteristics. It looks like mostly multi-region flashes
400 * have one "main" region and one or more small regions to
401 * store boot loader code or boot parameters or whatever. I
402 * guess we should just pick the largest region. But this is
403 * not implemented.
404 */
405 ubi_err("multiple regions, not implemented");
406 return -EINVAL;
407 }
408
409 /*
410 * Note, in this implementation we support MTD devices with 0x7FFFFFFF
411 * physical eraseblocks maximum.
412 */
413
414 ubi->peb_size = ubi->mtd->erasesize;
415 ubi->peb_count = ubi->mtd->size / ubi->mtd->erasesize;
416 ubi->flash_size = ubi->mtd->size;
417
418 if (ubi->mtd->block_isbad && ubi->mtd->block_markbad)
419 ubi->bad_allowed = 1;
420
421 ubi->min_io_size = ubi->mtd->writesize;
422 ubi->hdrs_min_io_size = ubi->mtd->writesize >> ubi->mtd->subpage_sft;
423
424 /* Make sure minimal I/O unit is power of 2 */
425 if (ubi->min_io_size == 0 ||
426 (ubi->min_io_size & (ubi->min_io_size - 1))) {
427 ubi_err("bad min. I/O unit");
428 return -EINVAL;
429 }
430
431 ubi_assert(ubi->hdrs_min_io_size > 0);
432 ubi_assert(ubi->hdrs_min_io_size <= ubi->min_io_size);
433 ubi_assert(ubi->min_io_size % ubi->hdrs_min_io_size == 0);
434
435 /* Calculate default aligned sizes of EC and VID headers */
436 ubi->ec_hdr_alsize = ALIGN(UBI_EC_HDR_SIZE, ubi->hdrs_min_io_size);
437 ubi->vid_hdr_alsize = ALIGN(UBI_VID_HDR_SIZE, ubi->hdrs_min_io_size);
438
439 dbg_msg("min_io_size %d", ubi->min_io_size);
440 dbg_msg("hdrs_min_io_size %d", ubi->hdrs_min_io_size);
441 dbg_msg("ec_hdr_alsize %d", ubi->ec_hdr_alsize);
442 dbg_msg("vid_hdr_alsize %d", ubi->vid_hdr_alsize);
443
444 if (ubi->vid_hdr_offset == 0)
445 /* Default offset */
446 ubi->vid_hdr_offset = ubi->vid_hdr_aloffset =
447 ubi->ec_hdr_alsize;
448 else {
449 ubi->vid_hdr_aloffset = ubi->vid_hdr_offset &
450 ~(ubi->hdrs_min_io_size - 1);
451 ubi->vid_hdr_shift = ubi->vid_hdr_offset -
452 ubi->vid_hdr_aloffset;
453 }
454
455 /* Similar for the data offset */
456 if (ubi->leb_start == 0) {
457 ubi->leb_start = ubi->vid_hdr_offset + ubi->vid_hdr_alsize;
458 ubi->leb_start = ALIGN(ubi->leb_start, ubi->min_io_size);
459 }
460
461 dbg_msg("vid_hdr_offset %d", ubi->vid_hdr_offset);
462 dbg_msg("vid_hdr_aloffset %d", ubi->vid_hdr_aloffset);
463 dbg_msg("vid_hdr_shift %d", ubi->vid_hdr_shift);
464 dbg_msg("leb_start %d", ubi->leb_start);
465
466 /* The shift must be aligned to 32-bit boundary */
467 if (ubi->vid_hdr_shift % 4) {
468 ubi_err("unaligned VID header shift %d",
469 ubi->vid_hdr_shift);
470 return -EINVAL;
471 }
472
473 /* Check sanity */
474 if (ubi->vid_hdr_offset < UBI_EC_HDR_SIZE ||
475 ubi->leb_start < ubi->vid_hdr_offset + UBI_VID_HDR_SIZE ||
476 ubi->leb_start > ubi->peb_size - UBI_VID_HDR_SIZE ||
477 ubi->leb_start % ubi->min_io_size) {
478 ubi_err("bad VID header (%d) or data offsets (%d)",
479 ubi->vid_hdr_offset, ubi->leb_start);
480 return -EINVAL;
481 }
482
483 /*
484 * It may happen that EC and VID headers are situated in one minimal
485 * I/O unit. In this case we can only accept this UBI image in
486 * read-only mode.
487 */
488 if (ubi->vid_hdr_offset + UBI_VID_HDR_SIZE <= ubi->hdrs_min_io_size) {
489 ubi_warn("EC and VID headers are in the same minimal I/O unit, "
490 "switch to read-only mode");
491 ubi->ro_mode = 1;
492 }
493
494 ubi->leb_size = ubi->peb_size - ubi->leb_start;
495
496 if (!(ubi->mtd->flags & MTD_WRITEABLE)) {
497 ubi_msg("MTD device %d is write-protected, attach in "
498 "read-only mode", ubi->mtd->index);
499 ubi->ro_mode = 1;
500 }
501
502 dbg_msg("leb_size %d", ubi->leb_size);
503 dbg_msg("ro_mode %d", ubi->ro_mode);
504
505 /*
506 * Note, ideally, we have to initialize ubi->bad_peb_count here. But
507 * unfortunately, MTD does not provide this information. We should loop
508 * over all physical eraseblocks and invoke mtd->block_is_bad() for
509 * each physical eraseblock. So, we skip ubi->bad_peb_count
510 * uninitialized and initialize it after scanning.
511 */
512
513 return 0;
514}
515
516/**
517 * attach_mtd_dev - attach an MTD device.
518 * @mtd_dev: MTD device name or number string
519 * @vid_hdr_offset: VID header offset
520 * @data_offset: data offset
521 *
522 * This function attaches an MTD device to UBI. It first treats @mtd_dev as the
523 * MTD device name, and tries to open it by this name. If it is unable to open,
524 * it tries to convert @mtd_dev to an integer and open the MTD device by its
525 * number. Returns zero in case of success and a negative error code in case of
526 * failure.
527 */
528static int attach_mtd_dev(const char *mtd_dev, int vid_hdr_offset,
529 int data_offset)
530{
531 struct ubi_device *ubi;
532 struct mtd_info *mtd;
533 int i, err;
534
535 mtd = get_mtd_device_nm(mtd_dev);
536 if (IS_ERR(mtd)) {
537 int mtd_num;
538 char *endp;
539
540 if (PTR_ERR(mtd) != -ENODEV)
541 return PTR_ERR(mtd);
542
543 /*
544 * Probably this is not MTD device name but MTD device number -
545 * check this out.
546 */
547 mtd_num = simple_strtoul(mtd_dev, &endp, 0);
548 if (*endp != '\0' || mtd_dev == endp) {
549 ubi_err("incorrect MTD device: \"%s\"", mtd_dev);
550 return -ENODEV;
551 }
552
553 mtd = get_mtd_device(NULL, mtd_num);
554 if (IS_ERR(mtd))
555 return PTR_ERR(mtd);
556 }
557
558 /* Check if we already have the same MTD device attached */
559 for (i = 0; i < ubi_devices_cnt; i++)
560 if (ubi_devices[i]->mtd->index == mtd->index) {
561 ubi_err("mtd%d is already attached to ubi%d",
562 mtd->index, i);
563 err = -EINVAL;
564 goto out_mtd;
565 }
566
567 ubi = ubi_devices[ubi_devices_cnt] = kzalloc(sizeof(struct ubi_device),
568 GFP_KERNEL);
569 if (!ubi) {
570 err = -ENOMEM;
571 goto out_mtd;
572 }
573
574 ubi->ubi_num = ubi_devices_cnt;
575 ubi->mtd = mtd;
576
577 dbg_msg("attaching mtd%d to ubi%d: VID header offset %d data offset %d",
578 ubi->mtd->index, ubi_devices_cnt, vid_hdr_offset, data_offset);
579
580 ubi->vid_hdr_offset = vid_hdr_offset;
581 ubi->leb_start = data_offset;
582 err = io_init(ubi);
583 if (err)
584 goto out_free;
585
586 err = attach_by_scanning(ubi);
587 if (err) {
588 dbg_err("failed to attach by scanning, error %d", err);
589 goto out_free;
590 }
591
592 err = uif_init(ubi);
593 if (err)
594 goto out_detach;
595
596 ubi_devices_cnt += 1;
597
598 ubi_msg("attached mtd%d to ubi%d", ubi->mtd->index, ubi->ubi_num);
599 ubi_msg("MTD device name: \"%s\"", ubi->mtd->name);
600 ubi_msg("MTD device size: %llu MiB", ubi->flash_size >> 20);
601 ubi_msg("physical eraseblock size: %d bytes (%d KiB)",
602 ubi->peb_size, ubi->peb_size >> 10);
603 ubi_msg("logical eraseblock size: %d bytes", ubi->leb_size);
604 ubi_msg("number of good PEBs: %d", ubi->good_peb_count);
605 ubi_msg("number of bad PEBs: %d", ubi->bad_peb_count);
606 ubi_msg("smallest flash I/O unit: %d", ubi->min_io_size);
607 ubi_msg("VID header offset: %d (aligned %d)",
608 ubi->vid_hdr_offset, ubi->vid_hdr_aloffset);
609 ubi_msg("data offset: %d", ubi->leb_start);
610 ubi_msg("max. allowed volumes: %d", ubi->vtbl_slots);
611 ubi_msg("wear-leveling threshold: %d", CONFIG_MTD_UBI_WL_THRESHOLD);
612 ubi_msg("number of internal volumes: %d", UBI_INT_VOL_COUNT);
613 ubi_msg("number of user volumes: %d",
614 ubi->vol_count - UBI_INT_VOL_COUNT);
615 ubi_msg("available PEBs: %d", ubi->avail_pebs);
616 ubi_msg("total number of reserved PEBs: %d", ubi->rsvd_pebs);
617 ubi_msg("number of PEBs reserved for bad PEB handling: %d",
618 ubi->beb_rsvd_pebs);
619 ubi_msg("max/mean erase counter: %d/%d", ubi->max_ec, ubi->mean_ec);
620
621 /* Enable the background thread */
622 if (!DBG_DISABLE_BGT) {
623 ubi->thread_enabled = 1;
624 wake_up_process(ubi->bgt_thread);
625 }
626
627 return 0;
628
629out_detach:
630 ubi_eba_close(ubi);
631 ubi_wl_close(ubi);
632 kfree(ubi->vtbl);
633out_free:
634 kfree(ubi);
635out_mtd:
636 put_mtd_device(mtd);
637 ubi_devices[ubi_devices_cnt] = NULL;
638 return err;
639}
640
641/**
642 * detach_mtd_dev - detach an MTD device.
643 * @ubi: UBI device description object
644 */
645static void detach_mtd_dev(struct ubi_device *ubi)
646{
647 int ubi_num = ubi->ubi_num, mtd_num = ubi->mtd->index;
648
649 dbg_msg("detaching mtd%d from ubi%d", ubi->mtd->index, ubi_num);
650 uif_close(ubi);
651 ubi_eba_close(ubi);
652 ubi_wl_close(ubi);
653 kfree(ubi->vtbl);
654 put_mtd_device(ubi->mtd);
655 kfree(ubi_devices[ubi_num]);
656 ubi_devices[ubi_num] = NULL;
657 ubi_devices_cnt -= 1;
658 ubi_assert(ubi_devices_cnt >= 0);
659 ubi_msg("mtd%d is detached from ubi%d", mtd_num, ubi_num);
660}
661
662static int __init ubi_init(void)
663{
664 int err, i, k;
665
666 /* Ensure that EC and VID headers have correct size */
667 BUILD_BUG_ON(sizeof(struct ubi_ec_hdr) != 64);
668 BUILD_BUG_ON(sizeof(struct ubi_vid_hdr) != 64);
669
670 if (mtd_devs > UBI_MAX_DEVICES) {
671 printk(KERN_ERR "UBI error: too many MTD devices, maximum is %d\n",
672 UBI_MAX_DEVICES);
673 return -EINVAL;
674 }
675
676 ubi_class = class_create(THIS_MODULE, UBI_NAME_STR);
677 if (IS_ERR(ubi_class))
678 return PTR_ERR(ubi_class);
679
680 err = class_create_file(ubi_class, &ubi_version);
681 if (err)
682 goto out_class;
683
684 /* Attach MTD devices */
685 for (i = 0; i < mtd_devs; i++) {
686 struct mtd_dev_param *p = &mtd_dev_param[i];
687
688 cond_resched();
689
690 if (!p->name) {
691 dbg_err("empty name");
692 err = -EINVAL;
693 goto out_detach;
694 }
695
696 err = attach_mtd_dev(p->name, p->vid_hdr_offs, p->data_offs);
697 if (err)
698 goto out_detach;
699 }
700
701 return 0;
702
703out_detach:
704 for (k = 0; k < i; k++)
705 detach_mtd_dev(ubi_devices[k]);
706 class_remove_file(ubi_class, &ubi_version);
707out_class:
708 class_destroy(ubi_class);
709 return err;
710}
711module_init(ubi_init);
712
713static void __exit ubi_exit(void)
714{
715 int i, n = ubi_devices_cnt;
716
717 for (i = 0; i < n; i++)
718 detach_mtd_dev(ubi_devices[i]);
719 class_remove_file(ubi_class, &ubi_version);
720 class_destroy(ubi_class);
721}
722module_exit(ubi_exit);
723
724/**
725 * bytes_str_to_int - convert a string representing number of bytes to an
726 * integer.
727 * @str: the string to convert
728 *
729 * This function returns positive resulting integer in case of success and a
730 * negative error code in case of failure.
731 */
732static int __init bytes_str_to_int(const char *str)
733{
734 char *endp;
735 unsigned long result;
736
737 result = simple_strtoul(str, &endp, 0);
738 if (str == endp || result < 0) {
739 printk(KERN_ERR "UBI error: incorrect bytes count: \"%s\"\n", str);
740 return -EINVAL;
741 }
742
743 switch (*endp) {
744 case 'G':
745 result *= 1024;
746 case 'M':
747 result *= 1024;
748 case 'K':
749 case 'k':
750 result *= 1024;
751 if (endp[1] == 'i' && (endp[2] == '\0' ||
752 endp[2] == 'B' || endp[2] == 'b'))
753 endp += 2;
754 case '\0':
755 break;
756 default:
757 printk(KERN_ERR "UBI error: incorrect bytes count: \"%s\"\n", str);
758 return -EINVAL;
759 }
760
761 return result;
762}
763
764/**
765 * ubi_mtd_param_parse - parse the 'mtd=' UBI parameter.
766 * @val: the parameter value to parse
767 * @kp: not used
768 *
769 * This function returns zero in case of success and a negative error code in
770 * case of error.
771 */
772static int __init ubi_mtd_param_parse(const char *val, struct kernel_param *kp)
773{
774 int i, len;
775 struct mtd_dev_param *p;
776 char buf[MTD_PARAM_LEN_MAX];
777 char *pbuf = &buf[0];
778 char *tokens[3] = {NULL, NULL, NULL};
779
780 if (mtd_devs == UBI_MAX_DEVICES) {
781 printk(KERN_ERR "UBI error: too many parameters, max. is %d\n",
782 UBI_MAX_DEVICES);
783 return -EINVAL;
784 }
785
786 len = strnlen(val, MTD_PARAM_LEN_MAX);
787 if (len == MTD_PARAM_LEN_MAX) {
788 printk(KERN_ERR "UBI error: parameter \"%s\" is too long, max. is %d\n",
789 val, MTD_PARAM_LEN_MAX);
790 return -EINVAL;
791 }
792
793 if (len == 0) {
794 printk(KERN_WARNING "UBI warning: empty 'mtd=' parameter - ignored\n");
795 return 0;
796 }
797
798 strcpy(buf, val);
799
800 /* Get rid of the final newline */
801 if (buf[len - 1] == '\n')
802 buf[len - 1] = 0;
803
804 for (i = 0; i < 3; i++)
805 tokens[i] = strsep(&pbuf, ",");
806
807 if (pbuf) {
808 printk(KERN_ERR "UBI error: too many arguments at \"%s\"\n", val);
809 return -EINVAL;
810 }
811
812 if (!tokens[0] || *tokens[0] == '\0')
813 return -EINVAL;
814
815 p = &mtd_dev_param[mtd_devs];
816 strcpy(&p->name[0], tokens[0]);
817
818 if (tokens[1])
819 p->vid_hdr_offs = bytes_str_to_int(tokens[1]);
820 if (tokens[2])
821 p->data_offs = bytes_str_to_int(tokens[2]);
822
823 if (p->vid_hdr_offs < 0)
824 return p->vid_hdr_offs;
825 if (p->data_offs < 0)
826 return p->data_offs;
827
828 mtd_devs += 1;
829 return 0;
830}
831
832module_param_call(mtd, ubi_mtd_param_parse, NULL, NULL, 000);
833MODULE_PARM_DESC(mtd, "MTD devices to attach. Parameter format: "
834 "mtd=<name|num>[,<vid_hdr_offs>,<data_offs>]. "
835 "Multiple \"mtd\" parameters may be specified.\n"
836 "MTD devices may be specified by their number or name. "
837 "Optional \"vid_hdr_offs\" and \"data_offs\" parameters "
838 "specify UBI VID header position and data starting "
839 "position to be used by UBI.\n"
840 "Example: mtd=content,1984,2048 mtd=4 - attach MTD device"
841 "with name content using VID header offset 1984 and data "
842 "start 2048, and MTD device number 4 using default "
843 "offsets");
844
845MODULE_VERSION(__stringify(UBI_VERSION));
846MODULE_DESCRIPTION("UBI - Unsorted Block Images");
847MODULE_AUTHOR("Artem Bityutskiy");
848MODULE_LICENSE("GPL");
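
Note: bytes_str_to_int() above depends on deliberate switch fall-through, so
'G' accumulates three factors of 1024, 'M' two, and 'K'/'k' one. A
self-contained user-space rendering of the same idea (validation of the
suffix tail is elided here, unlike in the kernel function):

	#include <stdio.h>
	#include <stdlib.h>

	/* Each case multiplies by 1024 and falls into the next one,
	 * so G = 1024 * 1024 * 1024. */
	static long parse_bytes(const char *str)
	{
		char *endp;
		long result = strtol(str, &endp, 0);

		switch (*endp) {
		case 'G': result *= 1024; /* fall through */
		case 'M': result *= 1024; /* fall through */
		case 'K':
		case 'k': result *= 1024;
		}
		return result;
	}

	int main(void)
	{
		/* prints: 2048 2048 1048576 */
		printf("%ld %ld %ld\n", parse_bytes("2048"),
		       parse_bytes("2KiB"), parse_bytes("1M"));
		return 0;
	}
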
diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c
new file mode 100644
index 000000000000..6612eb79bf17
--- /dev/null
+++ b/drivers/mtd/ubi/cdev.c
@@ -0,0 +1,722 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21/*
22 * This file includes implementation of UBI character device operations.
23 *
24 * There are two kinds of character devices in UBI: UBI character devices and
25 * UBI volume character devices. UBI character devices allow users to
26 * manipulate whole volumes: create, remove, and re-size them. Volume character
27 * devices provide volume I/O capabilities.
28 *
29 * Major and minor numbers are assigned dynamically to both UBI and volume
30 * character devices.
31 */
32
33#include <linux/module.h>
34#include <linux/stat.h>
35#include <linux/ioctl.h>
36#include <linux/capability.h>
37#include <mtd/ubi-user.h>
38#include <asm/uaccess.h>
39#include <asm/div64.h>
40#include "ubi.h"
41
42/*
43 * Maximum sequence numbers of UBI and volume character device IOCTLs (direct
44 * logical eraseblock erase is a debug-only feature).
45 */
46#define UBI_CDEV_IOC_MAX_SEQ 2
47#ifndef CONFIG_MTD_UBI_DEBUG_USERSPACE_IO
48#define VOL_CDEV_IOC_MAX_SEQ 1
49#else
50#define VOL_CDEV_IOC_MAX_SEQ 2
51#endif
52
53/**
54 * major_to_device - get UBI device object by character device major number.
55 * @major: major number
56 *
57 * This function returns a pointer to the UBI device object.
58 */
59static struct ubi_device *major_to_device(int major)
60{
61 int i;
62
63 for (i = 0; i < ubi_devices_cnt; i++)
64 if (ubi_devices[i] && ubi_devices[i]->major == major)
65 return ubi_devices[i];
66 BUG();
67}
68
69/**
70 * get_exclusive - get exclusive access to an UBI volume.
71 * @desc: volume descriptor
72 *
73 * This function changes UBI volume open mode to "exclusive". Returns previous
74 * mode value (positive integer) in case of success and a negative error code
75 * in case of failure.
76 */
77static int get_exclusive(struct ubi_volume_desc *desc)
78{
79 int users, err;
80 struct ubi_volume *vol = desc->vol;
81
82 spin_lock(&vol->ubi->volumes_lock);
83 users = vol->readers + vol->writers + vol->exclusive;
84 ubi_assert(users > 0);
85 if (users > 1) {
86 dbg_err("%d users for volume %d", users, vol->vol_id);
87 err = -EBUSY;
88 } else {
89 vol->readers = vol->writers = 0;
90 vol->exclusive = 1;
91 err = desc->mode;
92 desc->mode = UBI_EXCLUSIVE;
93 }
94 spin_unlock(&vol->ubi->volumes_lock);
95
96 return err;
97}
98
99/**
100 * revoke_exclusive - revoke exclusive mode.
101 * @desc: volume descriptor
102 * @mode: new mode to switch to
103 */
104static void revoke_exclusive(struct ubi_volume_desc *desc, int mode)
105{
106 struct ubi_volume *vol = desc->vol;
107
108 spin_lock(&vol->ubi->volumes_lock);
109 ubi_assert(vol->readers == 0 && vol->writers == 0);
110 ubi_assert(vol->exclusive == 1 && desc->mode == UBI_EXCLUSIVE);
111 vol->exclusive = 0;
112 if (mode == UBI_READONLY)
113 vol->readers = 1;
114 else if (mode == UBI_READWRITE)
115 vol->writers = 1;
116 else
117 vol->exclusive = 1;
118 spin_unlock(&vol->ubi->volumes_lock);
119
120 desc->mode = mode;
121}
122
123static int vol_cdev_open(struct inode *inode, struct file *file)
124{
125 struct ubi_volume_desc *desc;
126 const struct ubi_device *ubi = major_to_device(imajor(inode));
127 int vol_id = iminor(inode) - 1;
128 int mode;
129
130 if (file->f_mode & FMODE_WRITE)
131 mode = UBI_READWRITE;
132 else
133 mode = UBI_READONLY;
134
135 dbg_msg("open volume %d, mode %d", vol_id, mode);
136
137 desc = ubi_open_volume(ubi->ubi_num, vol_id, mode);
138 if (IS_ERR(desc))
139 return PTR_ERR(desc);
140
141 file->private_data = desc;
142 return 0;
143}
144
145static int vol_cdev_release(struct inode *inode, struct file *file)
146{
147 struct ubi_volume_desc *desc = file->private_data;
148 struct ubi_volume *vol = desc->vol;
149
150 dbg_msg("release volume %d, mode %d", vol->vol_id, desc->mode);
151
152 if (vol->updating) {
153 ubi_warn("update of volume %d not finished, volume is damaged",
154 vol->vol_id);
155 vol->updating = 0;
156 kfree(vol->upd_buf);
157 }
158
159 ubi_close_volume(desc);
160 return 0;
161}
162
163static loff_t vol_cdev_llseek(struct file *file, loff_t offset, int origin)
164{
165 struct ubi_volume_desc *desc = file->private_data;
166 struct ubi_volume *vol = desc->vol;
167 loff_t new_offset;
168
169 if (vol->updating) {
170 /* Update is in progress, seeking is prohibited */
171 dbg_err("updating");
172 return -EBUSY;
173 }
174
175 switch (origin) {
176 case 0: /* SEEK_SET */
177 new_offset = offset;
178 break;
179 case 1: /* SEEK_CUR */
180 new_offset = file->f_pos + offset;
181 break;
182 case 2: /* SEEK_END */
183 new_offset = vol->used_bytes + offset;
184 break;
185 default:
186 return -EINVAL;
187 }
188
189 if (new_offset < 0 || new_offset > vol->used_bytes) {
190 dbg_err("bad seek %lld", new_offset);
191 return -EINVAL;
192 }
193
194 dbg_msg("seek volume %d, offset %lld, origin %d, new offset %lld",
195 vol->vol_id, offset, origin, new_offset);
196
197 file->f_pos = new_offset;
198 return new_offset;
199}
200
201static ssize_t vol_cdev_read(struct file *file, __user char *buf, size_t count,
202 loff_t *offp)
203{
204 struct ubi_volume_desc *desc = file->private_data;
205 struct ubi_volume *vol = desc->vol;
206 struct ubi_device *ubi = vol->ubi;
207 int err, lnum, off, len, vol_id = desc->vol->vol_id, tbuf_size;
208 size_t count_save = count;
209 void *tbuf;
210 uint64_t tmp;
211
212 dbg_msg("read %zd bytes from offset %lld of volume %d",
213 count, *offp, vol_id);
214
215 if (vol->updating) {
216 dbg_err("updating");
217 return -EBUSY;
218 }
219 if (vol->upd_marker) {
220 dbg_err("damaged volume, update marker is set");
221 return -EBADF;
222 }
223 if (*offp == vol->used_bytes || count == 0)
224 return 0;
225
226 if (vol->corrupted)
227 dbg_msg("read from corrupted volume %d", vol_id);
228
229 if (*offp + count > vol->used_bytes)
230 count_save = count = vol->used_bytes - *offp;
231
232 tbuf_size = vol->usable_leb_size;
233 if (count < tbuf_size)
234 tbuf_size = ALIGN(count, ubi->min_io_size);
235 tbuf = kmalloc(tbuf_size, GFP_KERNEL);
236 if (!tbuf)
237 return -ENOMEM;
238
239 len = count > tbuf_size ? tbuf_size : count;
240
241 tmp = *offp;
242 off = do_div(tmp, vol->usable_leb_size);
243 lnum = tmp;
244
245 do {
246 cond_resched();
247
248 if (off + len >= vol->usable_leb_size)
249 len = vol->usable_leb_size - off;
250
251 err = ubi_eba_read_leb(ubi, vol_id, lnum, tbuf, off, len, 0);
252 if (err)
253 break;
254
255 off += len;
256 if (off == vol->usable_leb_size) {
257 lnum += 1;
258 off -= vol->usable_leb_size;
259 }
260
261 count -= len;
262 *offp += len;
263
264 err = copy_to_user(buf, tbuf, len);
265 if (err) {
266 err = -EFAULT;
267 break;
268 }
269
270 buf += len;
271 len = count > tbuf_size ? tbuf_size : count;
272 } while (count);
273
274 kfree(tbuf);
275 return err ? err : count_save - count;
276}
277
278#ifdef CONFIG_MTD_UBI_DEBUG_USERSPACE_IO
279
280/*
281 * This function allows direct writes to dynamic UBI volumes, without
282 * issuing the volume update operation. Available only as a debugging feature.
283 * Very useful for testing UBI.
284 */
285static ssize_t vol_cdev_direct_write(struct file *file, const char __user *buf,
286 size_t count, loff_t *offp)
287{
288 struct ubi_volume_desc *desc = file->private_data;
289 struct ubi_volume *vol = desc->vol;
290 struct ubi_device *ubi = vol->ubi;
291 int lnum, off, len, tbuf_size, vol_id = vol->vol_id, err = 0;
292 size_t count_save = count;
293 char *tbuf;
294 uint64_t tmp;
295
296 dbg_msg("requested: write %zd bytes to offset %lld of volume %u",
297 count, *offp, desc->vol->vol_id);
298
299 if (vol->vol_type == UBI_STATIC_VOLUME)
300 return -EROFS;
301
302 tmp = *offp;
303 off = do_div(tmp, vol->usable_leb_size);
304 lnum = tmp;
305
306 if (off % ubi->min_io_size) {
307 dbg_err("unaligned position");
308 return -EINVAL;
309 }
310
311 if (*offp + count > vol->used_bytes)
312 count_save = count = vol->used_bytes - *offp;
313
314 /* We can write only in fractions of the minimum I/O unit */
315 if (count % ubi->min_io_size) {
316 dbg_err("unaligned write length");
317 return -EINVAL;
318 }
319
320 tbuf_size = vol->usable_leb_size;
321 if (count < tbuf_size)
322 tbuf_size = ALIGN(count, ubi->min_io_size);
323 tbuf = kmalloc(tbuf_size, GFP_KERNEL);
324 if (!tbuf)
325 return -ENOMEM;
326
327 len = count > tbuf_size ? tbuf_size : count;
328
329 while (count) {
330 cond_resched();
331
332 if (off + len >= vol->usable_leb_size)
333 len = vol->usable_leb_size - off;
334
335 err = copy_from_user(tbuf, buf, len);
336 if (err) {
337 err = -EFAULT;
338 break;
339 }
340
341 err = ubi_eba_write_leb(ubi, vol_id, lnum, tbuf, off, len,
342 UBI_UNKNOWN);
343 if (err)
344 break;
345
346 off += len;
347 if (off == vol->usable_leb_size) {
348 lnum += 1;
349 off -= vol->usable_leb_size;
350 }
351
352 count -= len;
353 *offp += len;
354 buf += len;
355 len = count > tbuf_size ? tbuf_size : count;
356 }
357
358 kfree(tbuf);
359 return err ? err : count_save - count;
360}
361
362#else
363#define vol_cdev_direct_write(file, buf, count, offp) -EPERM
364#endif /* CONFIG_MTD_UBI_DEBUG_USERSPACE_IO */
365
366static ssize_t vol_cdev_write(struct file *file, const char __user *buf,
367 size_t count, loff_t *offp)
368{
369 int err = 0;
370 struct ubi_volume_desc *desc = file->private_data;
371 struct ubi_volume *vol = desc->vol;
372 struct ubi_device *ubi = vol->ubi;
373
374 if (!vol->updating)
375 return vol_cdev_direct_write(file, buf, count, offp);
376
377 err = ubi_more_update_data(ubi, vol->vol_id, buf, count);
378 if (err < 0) {
379 ubi_err("cannot write %zd bytes of update data", count);
380 return err;
381 }
382
383 if (err) {
384 /*
385 * Update is finished, @err contains number of actually written
386 * bytes now.
387 */
388 count = err;
389
390 err = ubi_check_volume(ubi, vol->vol_id);
391 if (err < 0)
392 return err;
393
394 if (err) {
395 ubi_warn("volume %d on UBI device %d is corrupted",
396 vol->vol_id, ubi->ubi_num);
397 vol->corrupted = 1;
398 }
399 vol->checked = 1;
400 revoke_exclusive(desc, UBI_READWRITE);
401 }
402
403 *offp += count;
404 return count;
405}
406
407static int vol_cdev_ioctl(struct inode *inode, struct file *file,
408 unsigned int cmd, unsigned long arg)
409{
410 int err = 0;
411 struct ubi_volume_desc *desc = file->private_data;
412 struct ubi_volume *vol = desc->vol;
413 struct ubi_device *ubi = vol->ubi;
414 void __user *argp = (void __user *)arg;
415
416 if (_IOC_NR(cmd) > VOL_CDEV_IOC_MAX_SEQ ||
417 _IOC_TYPE(cmd) != UBI_VOL_IOC_MAGIC)
418 return -ENOTTY;
419
420 if (_IOC_DIR(cmd) & _IOC_READ)
421 err = !access_ok(VERIFY_WRITE, argp, _IOC_SIZE(cmd));
422 else if (_IOC_DIR(cmd) & _IOC_WRITE)
423 err = !access_ok(VERIFY_READ, argp, _IOC_SIZE(cmd));
424 if (err)
425 return -EFAULT;
426
427 switch (cmd) {
428
429 /* Volume update command */
430 case UBI_IOCVOLUP:
431 {
432 int64_t bytes, rsvd_bytes;
433
434 if (!capable(CAP_SYS_RESOURCE)) {
435 err = -EPERM;
436 break;
437 }
438
439 err = copy_from_user(&bytes, argp, sizeof(int64_t));
440 if (err) {
441 err = -EFAULT;
442 break;
443 }
444
445 if (desc->mode == UBI_READONLY) {
446 err = -EROFS;
447 break;
448 }
449
450 rsvd_bytes = vol->reserved_pebs * (ubi->leb_size-vol->data_pad);
451 if (bytes < 0 || bytes > rsvd_bytes) {
452 err = -EINVAL;
453 break;
454 }
455
456 err = get_exclusive(desc);
457 if (err < 0)
458 break;
459
460 err = ubi_start_update(ubi, vol->vol_id, bytes);
461 if (bytes == 0)
462 revoke_exclusive(desc, UBI_READWRITE);
463
464 file->f_pos = 0;
465 break;
466 }
467
468#ifdef CONFIG_MTD_UBI_DEBUG_USERSPACE_IO
469 /* Logical eraseblock erasure command */
470 case UBI_IOCEBER:
471 {
472 int32_t lnum;
473
474 err = __get_user(lnum, (__user int32_t *)argp);
475 if (err) {
476 err = -EFAULT;
477 break;
478 }
479
480 if (desc->mode == UBI_READONLY) {
481 err = -EROFS;
482 break;
483 }
484
485 if (lnum < 0 || lnum >= vol->reserved_pebs) {
486 err = -EINVAL;
487 break;
488 }
489
490 if (vol->vol_type != UBI_DYNAMIC_VOLUME) {
491 err = -EROFS;
492 break;
493 }
494
495 dbg_msg("erase LEB %d:%d", vol->vol_id, lnum);
496 err = ubi_eba_unmap_leb(ubi, vol->vol_id, lnum);
497 if (err)
498 break;
499
500 err = ubi_wl_flush(ubi);
501 break;
502 }
503#endif
504
505 default:
506 err = -ENOTTY;
507 break;
508 }
509
510 return err;
511}
512
513/**
514 * verify_mkvol_req - verify volume creation request.
515 * @ubi: UBI device description object
516 * @req: the request to check
517 *
518 * This function returns zero if the request is correct, and %-EINVAL if not.
519 */
520static int verify_mkvol_req(const struct ubi_device *ubi,
521 const struct ubi_mkvol_req *req)
522{
523 int n, err = -EINVAL;
524
525 if (req->bytes < 0 || req->alignment < 0 || req->vol_type < 0 ||
526 req->name_len < 0)
527 goto bad;
528
529 if ((req->vol_id < 0 || req->vol_id >= ubi->vtbl_slots) &&
530 req->vol_id != UBI_VOL_NUM_AUTO)
531 goto bad;
532
533 if (req->alignment == 0)
534 goto bad;
535
536 if (req->bytes == 0)
537 goto bad;
538
539 if (req->vol_type != UBI_DYNAMIC_VOLUME &&
540 req->vol_type != UBI_STATIC_VOLUME)
541 goto bad;
542
543 if (req->alignment > ubi->leb_size)
544 goto bad;
545
546 n = req->alignment % ubi->min_io_size;
547 if (req->alignment != 1 && n)
548 goto bad;
549
550 if (req->name_len > UBI_VOL_NAME_MAX) {
551 err = -ENAMETOOLONG;
552 goto bad;
553 }
554
555 return 0;
556
557bad:
558 dbg_err("bad volume creation request");
559 ubi_dbg_dump_mkvol_req(req);
560 return err;
561}
562
563/**
564 * verify_rsvol_req - verify volume re-size request.
565 * @ubi: UBI device description object
566 * @req: the request to check
567 *
568 * This function returns zero if the request is correct, and %-EINVAL if not.
569 */
570static int verify_rsvol_req(const struct ubi_device *ubi,
571 const struct ubi_rsvol_req *req)
572{
573 if (req->bytes <= 0)
574 return -EINVAL;
575
576 if (req->vol_id < 0 || req->vol_id >= ubi->vtbl_slots)
577 return -EINVAL;
578
579 return 0;
580}
581
582static int ubi_cdev_ioctl(struct inode *inode, struct file *file,
583 unsigned int cmd, unsigned long arg)
584{
585 int err = 0;
586 struct ubi_device *ubi;
587 struct ubi_volume_desc *desc;
588 void __user *argp = (void __user *)arg;
589
590 if (_IOC_NR(cmd) > UBI_CDEV_IOC_MAX_SEQ ||
591 _IOC_TYPE(cmd) != UBI_IOC_MAGIC)
592 return -ENOTTY;
593
594 if (_IOC_DIR(cmd) & _IOC_READ)
595 err = !access_ok(VERIFY_WRITE, argp, _IOC_SIZE(cmd));
596 else if (_IOC_DIR(cmd) & _IOC_WRITE)
597 err = !access_ok(VERIFY_READ, argp, _IOC_SIZE(cmd));
598 if (err)
599 return -EFAULT;
600
601 if (!capable(CAP_SYS_RESOURCE))
602 return -EPERM;
603
604 ubi = major_to_device(imajor(inode));
605 if (IS_ERR(ubi))
606 return PTR_ERR(ubi);
607
608 switch (cmd) {
609 /* Create volume command */
610 case UBI_IOCMKVOL:
611 {
612 struct ubi_mkvol_req req;
613
614 dbg_msg("create volume");
615 err = __copy_from_user(&req, argp,
616 sizeof(struct ubi_mkvol_req));
617 if (err) {
618 err = -EFAULT;
619 break;
620 }
621
622 err = verify_mkvol_req(ubi, &req);
623 if (err)
624 break;
625
626 req.name[req.name_len] = '\0';
627
628 err = ubi_create_volume(ubi, &req);
629 if (err)
630 break;
631
632 err = __put_user(req.vol_id, (__user int32_t *)argp);
633 if (err)
634 err = -EFAULT;
635
636 break;
637 }
638
639 /* Remove volume command */
640 case UBI_IOCRMVOL:
641 {
642 int vol_id;
643
644 dbg_msg("remove volume");
645 err = __get_user(vol_id, (__user int32_t *)argp);
646 if (err) {
647 err = -EFAULT;
648 break;
649 }
650
651 desc = ubi_open_volume(ubi->ubi_num, vol_id, UBI_EXCLUSIVE);
652 if (IS_ERR(desc)) {
653 err = PTR_ERR(desc);
654 break;
655 }
656
657 err = ubi_remove_volume(desc);
658 if (err)
659 ubi_close_volume(desc);
660
661 break;
662 }
663
664 /* Re-size volume command */
665 case UBI_IOCRSVOL:
666 {
667 int pebs;
668 uint64_t tmp;
669 struct ubi_rsvol_req req;
670
671 dbg_msg("re-size volume");
672 err = __copy_from_user(&req, argp,
673 sizeof(struct ubi_rsvol_req));
674 if (err) {
675 err = -EFAULT;
676 break;
677 }
678
679 err = verify_rsvol_req(ubi, &req);
680 if (err)
681 break;
682
683 desc = ubi_open_volume(ubi->ubi_num, req.vol_id, UBI_EXCLUSIVE);
684 if (IS_ERR(desc)) {
685 err = PTR_ERR(desc);
686 break;
687 }
688
689 tmp = req.bytes;
690 pebs = !!do_div(tmp, desc->vol->usable_leb_size);
691 pebs += tmp;
692
693 err = ubi_resize_volume(desc, pebs);
694 ubi_close_volume(desc);
695 break;
696 }
697
698 default:
699 err = -ENOTTY;
700 break;
701 }
702
703 return err;
704}
705
706/* UBI character device operations */
707struct file_operations ubi_cdev_operations = {
708 .owner = THIS_MODULE,
709 .ioctl = ubi_cdev_ioctl,
710 .llseek = no_llseek
711};
712
713/* UBI volume character device operations */
714struct file_operations ubi_vol_cdev_operations = {
715 .owner = THIS_MODULE,
716 .open = vol_cdev_open,
717 .release = vol_cdev_release,
718 .llseek = vol_cdev_llseek,
719 .read = vol_cdev_read,
720 .write = vol_cdev_write,
721 .ioctl = vol_cdev_ioctl
722};
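
Note: the read, write, seek and resize paths above all reduce a flat byte
position inside a volume to a (logical eraseblock, offset) pair via do_div().
A user-space sketch of that arithmetic (the function name is made up for
illustration; do_div() itself is a kernel macro that stores the quotient in
place and returns the remainder):

	#include <stdint.h>
	#include <stdio.h>

	/* Split a byte position into the LEB number that contains it
	 * (quotient) and the offset inside that LEB (remainder). */
	static void pos_to_leb(uint64_t pos, uint32_t usable_leb_size,
			       uint32_t *lnum, uint32_t *off)
	{
		*lnum = pos / usable_leb_size;
		*off  = pos % usable_leb_size;
	}

	int main(void)
	{
		uint32_t lnum, off;

		/* e.g. 128 KiB PEBs minus 2 KiB of headers -> 129024-byte LEBs */
		pos_to_leb(300000, 129024, &lnum, &off);
		printf("lnum=%u off=%u\n", lnum, off); /* lnum=2 off=41952 */
		return 0;
	}
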
diff --git a/drivers/mtd/ubi/debug.c b/drivers/mtd/ubi/debug.c
new file mode 100644
index 000000000000..86364221fafe
--- /dev/null
+++ b/drivers/mtd/ubi/debug.c
@@ -0,0 +1,224 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21/*
22 * Here we keep all the UBI debugging stuff which should normally be disabled
23 * and compiled out, but which is extremely helpful when hunting bugs or
24 * making big changes.
25 */
26
27#ifdef CONFIG_MTD_UBI_DEBUG_MSG
28
29#include "ubi.h"
30
31/**
32 * ubi_dbg_dump_ec_hdr - dump an erase counter header.
33 * @ec_hdr: the erase counter header to dump
34 */
35void ubi_dbg_dump_ec_hdr(const struct ubi_ec_hdr *ec_hdr)
36{
37 dbg_msg("erase counter header dump:");
38 dbg_msg("magic %#08x", ubi32_to_cpu(ec_hdr->magic));
39 dbg_msg("version %d", (int)ec_hdr->version);
40 dbg_msg("ec %llu", (long long)ubi64_to_cpu(ec_hdr->ec));
41 dbg_msg("vid_hdr_offset %d", ubi32_to_cpu(ec_hdr->vid_hdr_offset));
42 dbg_msg("data_offset %d", ubi32_to_cpu(ec_hdr->data_offset));
43 dbg_msg("hdr_crc %#08x", ubi32_to_cpu(ec_hdr->hdr_crc));
44 dbg_msg("erase counter header hexdump:");
45 ubi_dbg_hexdump(ec_hdr, UBI_EC_HDR_SIZE);
46}
47
48/**
49 * ubi_dbg_dump_vid_hdr - dump a volume identifier header.
50 * @vid_hdr: the volume identifier header to dump
51 */
52void ubi_dbg_dump_vid_hdr(const struct ubi_vid_hdr *vid_hdr)
53{
54 dbg_msg("volume identifier header dump:");
55 dbg_msg("magic %08x", ubi32_to_cpu(vid_hdr->magic));
56 dbg_msg("version %d", (int)vid_hdr->version);
57 dbg_msg("vol_type %d", (int)vid_hdr->vol_type);
58 dbg_msg("copy_flag %d", (int)vid_hdr->copy_flag);
59 dbg_msg("compat %d", (int)vid_hdr->compat);
60 dbg_msg("vol_id %d", ubi32_to_cpu(vid_hdr->vol_id));
61 dbg_msg("lnum %d", ubi32_to_cpu(vid_hdr->lnum));
62 dbg_msg("leb_ver %u", ubi32_to_cpu(vid_hdr->leb_ver));
63 dbg_msg("data_size %d", ubi32_to_cpu(vid_hdr->data_size));
64 dbg_msg("used_ebs %d", ubi32_to_cpu(vid_hdr->used_ebs));
65 dbg_msg("data_pad %d", ubi32_to_cpu(vid_hdr->data_pad));
66 dbg_msg("sqnum %llu",
67 (unsigned long long)ubi64_to_cpu(vid_hdr->sqnum));
68 dbg_msg("hdr_crc %08x", ubi32_to_cpu(vid_hdr->hdr_crc));
69 dbg_msg("volume identifier header hexdump:");
70}
71
72/**
73 * ubi_dbg_dump_vol_info- dump volume information.
74 * @vol: UBI volume description object
75 */
76void ubi_dbg_dump_vol_info(const struct ubi_volume *vol)
77{
78 dbg_msg("volume information dump:");
79 dbg_msg("vol_id %d", vol->vol_id);
80 dbg_msg("reserved_pebs %d", vol->reserved_pebs);
81 dbg_msg("alignment %d", vol->alignment);
82 dbg_msg("data_pad %d", vol->data_pad);
83 dbg_msg("vol_type %d", vol->vol_type);
84 dbg_msg("name_len %d", vol->name_len);
85 dbg_msg("usable_leb_size %d", vol->usable_leb_size);
86 dbg_msg("used_ebs %d", vol->used_ebs);
87 dbg_msg("used_bytes %lld", vol->used_bytes);
88 dbg_msg("last_eb_bytes %d", vol->last_eb_bytes);
89 dbg_msg("corrupted %d", vol->corrupted);
90 dbg_msg("upd_marker %d", vol->upd_marker);
91
92 if (vol->name_len <= UBI_VOL_NAME_MAX &&
93 strnlen(vol->name, vol->name_len + 1) == vol->name_len) {
94 dbg_msg("name %s", vol->name);
95 } else {
96 dbg_msg("the 1st 5 characters of the name: %c%c%c%c%c",
97 vol->name[0], vol->name[1], vol->name[2],
98 vol->name[3], vol->name[4]);
99 }
100}
101
102/**
103 * ubi_dbg_dump_vtbl_record - dump a &struct ubi_vtbl_record object.
104 * @r: the object to dump
105 * @idx: volume table index
106 */
107void ubi_dbg_dump_vtbl_record(const struct ubi_vtbl_record *r, int idx)
108{
109 int name_len = ubi16_to_cpu(r->name_len);
110
111 dbg_msg("volume table record %d dump:", idx);
112 dbg_msg("reserved_pebs %d", ubi32_to_cpu(r->reserved_pebs));
113 dbg_msg("alignment %d", ubi32_to_cpu(r->alignment));
114 dbg_msg("data_pad %d", ubi32_to_cpu(r->data_pad));
115 dbg_msg("vol_type %d", (int)r->vol_type);
116 dbg_msg("upd_marker %d", (int)r->upd_marker);
117 dbg_msg("name_len %d", name_len);
118
119 if (r->name[0] == '\0') {
120 dbg_msg("name NULL");
121 return;
122 }
123
124 if (name_len <= UBI_VOL_NAME_MAX &&
125 strnlen(&r->name[0], name_len + 1) == name_len) {
126 dbg_msg("name %s", &r->name[0]);
127 } else {
128 dbg_msg("1st 5 characters of the name: %c%c%c%c%c",
129 r->name[0], r->name[1], r->name[2], r->name[3],
130 r->name[4]);
131 }
132 dbg_msg("crc %#08x", ubi32_to_cpu(r->crc));
133}
134
135/**
136 * ubi_dbg_dump_sv - dump a &struct ubi_scan_volume object.
137 * @sv: the object to dump
138 */
139void ubi_dbg_dump_sv(const struct ubi_scan_volume *sv)
140{
141 dbg_msg("volume scanning information dump:");
142 dbg_msg("vol_id %d", sv->vol_id);
143 dbg_msg("highest_lnum %d", sv->highest_lnum);
144 dbg_msg("leb_count %d", sv->leb_count);
145 dbg_msg("compat %d", sv->compat);
146 dbg_msg("vol_type %d", sv->vol_type);
147 dbg_msg("used_ebs %d", sv->used_ebs);
148 dbg_msg("last_data_size %d", sv->last_data_size);
149 dbg_msg("data_pad %d", sv->data_pad);
150}
151
152/**
153 * ubi_dbg_dump_seb - dump a &struct ubi_scan_leb object.
154 * @seb: the object to dump
155 * @type: object type: 0 - not corrupted, 1 - corrupted
156 */
157void ubi_dbg_dump_seb(const struct ubi_scan_leb *seb, int type)
158{
159 dbg_msg("eraseblock scanning information dump:");
160 dbg_msg("ec %d", seb->ec);
161 dbg_msg("pnum %d", seb->pnum);
162 if (type == 0) {
163 dbg_msg("lnum %d", seb->lnum);
164 dbg_msg("scrub %d", seb->scrub);
165 dbg_msg("sqnum %llu", seb->sqnum);
166 dbg_msg("leb_ver %u", seb->leb_ver);
167 }
168}
169
170/**
171 * ubi_dbg_dump_mkvol_req - dump a &struct ubi_mkvol_req object.
172 * @req: the object to dump
173 */
174void ubi_dbg_dump_mkvol_req(const struct ubi_mkvol_req *req)
175{
176 char nm[17];
177
178 dbg_msg("volume creation request dump:");
179 dbg_msg("vol_id %d", req->vol_id);
180 dbg_msg("alignment %d", req->alignment);
181 dbg_msg("bytes %lld", (long long)req->bytes);
182 dbg_msg("vol_type %d", req->vol_type);
183 dbg_msg("name_len %d", req->name_len);
184
185 memcpy(nm, req->name, 16);
186 nm[16] = 0;
187 dbg_msg("the 1st 16 characters of the name: %s", nm);
188}
189
190#define BYTES_PER_LINE 32
191
192/**
193 * ubi_dbg_hexdump - dump a buffer.
194 * @ptr: the buffer to dump
195 * @size: buffer size which must be multiple of 4 bytes
196 */
197void ubi_dbg_hexdump(const void *ptr, int size)
198{
199 int i, k = 0, rows, columns;
200 const uint8_t *p = ptr;
201
202 size = ALIGN(size, 4);
203	rows = size/BYTES_PER_LINE + !!(size % BYTES_PER_LINE);
204 for (i = 0; i < rows; i++) {
205 int j;
206
207 cond_resched();
208 columns = min(size - k, BYTES_PER_LINE) / 4;
209 if (columns == 0)
210 break;
211 printk(KERN_DEBUG "%5d: ", i * BYTES_PER_LINE);
212 for (j = 0; j < columns; j++) {
213 int n, N;
214
215 N = size - k > 4 ? 4 : size - k;
216 for (n = 0; n < N; n++)
217 printk("%02x", p[k++]);
218 printk(" ");
219 }
220 printk("\n");
221 }
222}
223
224#endif /* CONFIG_MTD_UBI_DEBUG_MSG */
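
The ubi_dbg_hexdump() function above emits BYTES_PER_LINE (32) bytes per row, grouped into 4-byte columns, with each row prefixed by its starting byte offset. The following standalone userspace approximation of that layout may help when reading its output; it is a sketch only and omits the kernel's cond_resched() call and the 4-byte size alignment:

    #include <stdint.h>
    #include <stdio.h>

    /* Approximation of the ubi_dbg_hexdump() row/column layout. */
    static void hexdump(const void *ptr, int size)
    {
            const uint8_t *p = ptr;
            int k = 0;

            while (k < size) {
                    printf("%5d: ", k);     /* row prefix: starting offset */
                    for (int j = 0; j < 32 && k < size; j++) {
                            printf("%02x", p[k++]);
                            if (j % 4 == 3)
                                    printf(" ");    /* gap after each 4-byte column */
                    }
                    printf("\n");
            }
    }

    int main(void)
    {
            uint8_t buf[40];

            for (int i = 0; i < 40; i++)
                    buf[i] = (uint8_t)i;
            hexdump(buf, sizeof(buf));
            return 0;
    }
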
diff --git a/drivers/mtd/ubi/debug.h b/drivers/mtd/ubi/debug.h
new file mode 100644
index 000000000000..f816ad9a36c0
--- /dev/null
+++ b/drivers/mtd/ubi/debug.h
@@ -0,0 +1,161 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21#ifndef __UBI_DEBUG_H__
22#define __UBI_DEBUG_H__
23
24#ifdef CONFIG_MTD_UBI_DEBUG
25#include <linux/random.h>
26
27#define ubi_assert(expr) BUG_ON(!(expr))
28#define dbg_err(fmt, ...) ubi_err(fmt, ##__VA_ARGS__)
29#else
30#define ubi_assert(expr) ({})
31#define dbg_err(fmt, ...) ({})
32#endif
33
34#ifdef CONFIG_MTD_UBI_DEBUG_DISABLE_BGT
35#define DBG_DISABLE_BGT 1
36#else
37#define DBG_DISABLE_BGT 0
38#endif
39
40#ifdef CONFIG_MTD_UBI_DEBUG_MSG
41/* Generic debugging message */
42#define dbg_msg(fmt, ...) \
43 printk(KERN_DEBUG "UBI DBG: %s: " fmt "\n", __FUNCTION__, ##__VA_ARGS__)
44
45#define ubi_dbg_dump_stack() dump_stack()
46
47struct ubi_ec_hdr;
48struct ubi_vid_hdr;
49struct ubi_volume;
50struct ubi_vtbl_record;
51struct ubi_scan_volume;
52struct ubi_scan_leb;
53struct ubi_mkvol_req;
54
55void ubi_dbg_print(int type, const char *func, const char *fmt, ...);
56void ubi_dbg_dump_ec_hdr(const struct ubi_ec_hdr *ec_hdr);
57void ubi_dbg_dump_vid_hdr(const struct ubi_vid_hdr *vid_hdr);
58void ubi_dbg_dump_vol_info(const struct ubi_volume *vol);
59void ubi_dbg_dump_vtbl_record(const struct ubi_vtbl_record *r, int idx);
60void ubi_dbg_dump_sv(const struct ubi_scan_volume *sv);
61void ubi_dbg_dump_seb(const struct ubi_scan_leb *seb, int type);
62void ubi_dbg_dump_mkvol_req(const struct ubi_mkvol_req *req);
63void ubi_dbg_hexdump(const void *buf, int size);
64
65#else
66
67#define dbg_msg(fmt, ...) ({})
68#define ubi_dbg_dump_stack() ({})
69#define ubi_dbg_print(func, fmt, ...) ({})
70#define ubi_dbg_dump_ec_hdr(ec_hdr) ({})
71#define ubi_dbg_dump_vid_hdr(vid_hdr) ({})
72#define ubi_dbg_dump_vol_info(vol) ({})
73#define ubi_dbg_dump_vtbl_record(r, idx) ({})
74#define ubi_dbg_dump_sv(sv) ({})
75#define ubi_dbg_dump_seb(seb, type) ({})
76#define ubi_dbg_dump_mkvol_req(req) ({})
77#define ubi_dbg_hexdump(buf, size) ({})
78
79#endif /* CONFIG_MTD_UBI_DEBUG_MSG */
80
81#ifdef CONFIG_MTD_UBI_DEBUG_MSG_EBA
82/* Messages from the eraseblock association unit */
83#define dbg_eba(fmt, ...) \
84 printk(KERN_DEBUG "UBI DBG eba: %s: " fmt "\n", __FUNCTION__, \
85 ##__VA_ARGS__)
86#else
87#define dbg_eba(fmt, ...) ({})
88#endif
89
90#ifdef CONFIG_MTD_UBI_DEBUG_MSG_WL
91/* Messages from the wear-leveling unit */
92#define dbg_wl(fmt, ...) \
93 printk(KERN_DEBUG "UBI DBG wl: %s: " fmt "\n", __FUNCTION__, \
94 ##__VA_ARGS__)
95#else
96#define dbg_wl(fmt, ...) ({})
97#endif
98
99#ifdef CONFIG_MTD_UBI_DEBUG_MSG_IO
100/* Messages from the input/output unit */
101#define dbg_io(fmt, ...) \
102 printk(KERN_DEBUG "UBI DBG io: %s: " fmt "\n", __FUNCTION__, \
103 ##__VA_ARGS__)
104#else
105#define dbg_io(fmt, ...) ({})
106#endif
107
108#ifdef CONFIG_MTD_UBI_DEBUG_MSG_BLD
109/* Initialization and build messages */
110#define dbg_bld(fmt, ...) \
111 printk(KERN_DEBUG "UBI DBG bld: %s: " fmt "\n", __FUNCTION__, \
112 ##__VA_ARGS__)
113#else
114#define dbg_bld(fmt, ...) ({})
115#endif
116
117#ifdef CONFIG_MTD_UBI_DEBUG_EMULATE_BITFLIPS
118/**
119 * ubi_dbg_is_bitflip - if it is time to emulate a bit-flip.
120 *
121 * Returns non-zero if a bit-flip should be emulated, otherwise returns zero.
122 */
123static inline int ubi_dbg_is_bitflip(void)
124{
125 return !(random32() % 200);
126}
127#else
128#define ubi_dbg_is_bitflip() 0
129#endif
130
131#ifdef CONFIG_MTD_UBI_DEBUG_EMULATE_WRITE_FAILURES
132/**
133 * ubi_dbg_is_write_failure - if it is time to emulate a write failure.
134 *
135 * Returns non-zero if a write failure should be emulated, otherwise returns
136 * zero.
137 */
138static inline int ubi_dbg_is_write_failure(void)
139{
140 return !(random32() % 500);
141}
142#else
143#define ubi_dbg_is_write_failure() 0
144#endif
145
146#ifdef CONFIG_MTD_UBI_DEBUG_EMULATE_ERASE_FAILURES
147/**
148 * ubi_dbg_is_erase_failure - if it is time to emulate an erase failure.
149 *
150 * Returns non-zero if an erase failure should be emulated, otherwise returns
151 * zero.
152 */
153static inline int ubi_dbg_is_erase_failure(void)
154{
155 return !(random32() % 400);
156}
157#else
158#define ubi_dbg_is_erase_failure() 0
159#endif
160
161#endif /* !__UBI_DEBUG_H__ */
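
The CONFIG_MTD_UBI_DEBUG_EMULATE_* predicates above inject faults with fixed probabilities: roughly one read in 200 reports an emulated bit-flip, while one write in 500 and one erase in 400 report emulated failures. The userspace sketch below demonstrates the expected rate, with rand() standing in for the kernel's random32(); it illustrates the idiom and is not kernel code:

    #include <stdio.h>
    #include <stdlib.h>

    /* Same idiom as ubi_dbg_is_bitflip(): true once per ~200 calls. */
    static int dbg_is_bitflip(void)
    {
            return !(rand() % 200);
    }

    int main(void)
    {
            int hits = 0, trials = 1000000;

            for (int i = 0; i < trials; i++)
                    hits += dbg_is_bitflip();
            printf("emulated bit-flip rate: %f (expect ~0.005)\n",
                   (double)hits / trials);
            return 0;
    }
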
diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c
new file mode 100644
index 000000000000..d847ee1da3d9
--- /dev/null
+++ b/drivers/mtd/ubi/eba.c
@@ -0,0 +1,1241 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21/*
22 * The UBI Eraseblock Association (EBA) unit.
23 *
24 * This unit is responsible for I/O to/from logical eraseblocks.
25 *
26 * Although in this implementation the EBA table is fully kept and managed in
27 * RAM, which implies poor scalability, it might be (partially) maintained on
28 * flash in future implementations.
29 *
30 * The EBA unit implements per-logical eraseblock locking. Before accessing a
31 * logical eraseblock it is locked for reading or writing. The per-logical
32 * eraseblock locking is implemented by means of the lock tree. The lock tree
33 * is an RB-tree which refers to all currently locked logical eraseblocks. The
34 * lock tree elements are &struct ltree_entry objects. They are indexed by
35 * (@vol_id, @lnum) pairs.
36 *
37 * EBA also maintains the global sequence counter which is incremented each
38 * time a logical eraseblock is mapped to a physical eraseblock and it is
39 * stored in the volume identifier header. This means that each VID header has
40 * a unique sequence number. The sequence number is only increased, and we
41 * assume 64 bits is enough for it never to overflow.
42 */
43
44#include <linux/slab.h>
45#include <linux/crc32.h>
46#include <linux/err.h>
47#include "ubi.h"
48
49/**
50 * struct ltree_entry - an entry in the lock tree.
51 * @rb: links RB-tree nodes
52 * @vol_id: volume ID of the locked logical eraseblock
53 * @lnum: locked logical eraseblock number
54 * @users: how many tasks are using this logical eraseblock or wait for it
55 * @mutex: read/write mutex to implement read/write access serialization to
56 * the (@vol_id, @lnum) logical eraseblock
57 *
58 * When a logical eraseblock is being locked, the corresponding &struct ltree_entry
59 * object is inserted into the lock tree (@ubi->ltree).
60 */
61struct ltree_entry {
62 struct rb_node rb;
63 int vol_id;
64 int lnum;
65 int users;
66 struct rw_semaphore mutex;
67};
68
69/* Slab cache for lock-tree entries */
70static struct kmem_cache *ltree_slab;
71
72/**
73 * next_sqnum - get next sequence number.
74 * @ubi: UBI device description object
75 *
76 * This function returns the next sequence number to use, which is just the
77 * current global sequence counter value. It also increments the global sequence
78 * counter.
79 */
80static unsigned long long next_sqnum(struct ubi_device *ubi)
81{
82 unsigned long long sqnum;
83
84 spin_lock(&ubi->ltree_lock);
85 sqnum = ubi->global_sqnum++;
86 spin_unlock(&ubi->ltree_lock);
87
88 return sqnum;
89}
90
91/**
92 * ubi_get_compat - get compatibility flags of a volume.
93 * @ubi: UBI device description object
94 * @vol_id: volume ID
95 *
96 * This function returns compatibility flags for an internal volume. User
97 * volumes have no compatibility flags, so %0 is returned.
98 */
99static int ubi_get_compat(const struct ubi_device *ubi, int vol_id)
100{
101 if (vol_id == UBI_LAYOUT_VOL_ID)
102 return UBI_LAYOUT_VOLUME_COMPAT;
103 return 0;
104}
105
106/**
107 * ltree_lookup - look up the lock tree.
108 * @ubi: UBI device description object
109 * @vol_id: volume ID
110 * @lnum: logical eraseblock number
111 *
112 * This function returns a pointer to the corresponding &struct ltree_entry
113 * object if the logical eraseblock is locked and %NULL if it is not.
114 * @ubi->ltree_lock has to be locked.
115 */
116static struct ltree_entry *ltree_lookup(struct ubi_device *ubi, int vol_id,
117 int lnum)
118{
119 struct rb_node *p;
120
121 p = ubi->ltree.rb_node;
122 while (p) {
123 struct ltree_entry *le;
124
125 le = rb_entry(p, struct ltree_entry, rb);
126
127 if (vol_id < le->vol_id)
128 p = p->rb_left;
129 else if (vol_id > le->vol_id)
130 p = p->rb_right;
131 else {
132 if (lnum < le->lnum)
133 p = p->rb_left;
134 else if (lnum > le->lnum)
135 p = p->rb_right;
136 else
137 return le;
138 }
139 }
140
141 return NULL;
142}
143
144/**
145 * ltree_add_entry - add new entry to the lock tree.
146 * @ubi: UBI device description object
147 * @vol_id: volume ID
148 * @lnum: logical eraseblock number
149 *
150 * This function adds new entry for logical eraseblock (@vol_id, @lnum) to the
151 * lock tree. If such an entry is already there, its usage counter is increased.
152 * Returns pointer to the lock tree entry or %-ENOMEM if memory allocation
153 * failed.
154 */
155static struct ltree_entry *ltree_add_entry(struct ubi_device *ubi, int vol_id,
156 int lnum)
157{
158 struct ltree_entry *le, *le1, *le_free;
159
160 le = kmem_cache_alloc(ltree_slab, GFP_KERNEL);
161 if (!le)
162 return ERR_PTR(-ENOMEM);
163
164 le->vol_id = vol_id;
165 le->lnum = lnum;
166
167 spin_lock(&ubi->ltree_lock);
168 le1 = ltree_lookup(ubi, vol_id, lnum);
169
170 if (le1) {
171 /*
172 * This logical eraseblock is already locked. The newly
173 * allocated lock entry is not needed.
174 */
175 le_free = le;
176 le = le1;
177 } else {
178 struct rb_node **p, *parent = NULL;
179
180 /*
181 * No lock entry, add the newly allocated one to the
182 * @ubi->ltree RB-tree.
183 */
184 le_free = NULL;
185
186 p = &ubi->ltree.rb_node;
187 while (*p) {
188 parent = *p;
189 le1 = rb_entry(parent, struct ltree_entry, rb);
190
191 if (vol_id < le1->vol_id)
192 p = &(*p)->rb_left;
193 else if (vol_id > le1->vol_id)
194 p = &(*p)->rb_right;
195 else {
196 ubi_assert(lnum != le1->lnum);
197 if (lnum < le1->lnum)
198 p = &(*p)->rb_left;
199 else
200 p = &(*p)->rb_right;
201 }
202 }
203
204 rb_link_node(&le->rb, parent, p);
205 rb_insert_color(&le->rb, &ubi->ltree);
206 }
207 le->users += 1;
208 spin_unlock(&ubi->ltree_lock);
209
210 if (le_free)
211 kmem_cache_free(ltree_slab, le_free);
212
213 return le;
214}
215
216/**
217 * leb_read_lock - lock logical eraseblock for reading.
218 * @ubi: UBI device description object
219 * @vol_id: volume ID
220 * @lnum: logical eraseblock number
221 *
222 * This function locks a logical eraseblock for reading. Returns zero in case
223 * of success and a negative error code in case of failure.
224 */
225static int leb_read_lock(struct ubi_device *ubi, int vol_id, int lnum)
226{
227 struct ltree_entry *le;
228
229 le = ltree_add_entry(ubi, vol_id, lnum);
230 if (IS_ERR(le))
231 return PTR_ERR(le);
232 down_read(&le->mutex);
233 return 0;
234}
235
236/**
237 * leb_read_unlock - unlock logical eraseblock.
238 * @ubi: UBI device description object
239 * @vol_id: volume ID
240 * @lnum: logical eraseblock number
241 */
242static void leb_read_unlock(struct ubi_device *ubi, int vol_id, int lnum)
243{
244 int free = 0;
245 struct ltree_entry *le;
246
247 spin_lock(&ubi->ltree_lock);
248 le = ltree_lookup(ubi, vol_id, lnum);
249 le->users -= 1;
250 ubi_assert(le->users >= 0);
251 if (le->users == 0) {
252 rb_erase(&le->rb, &ubi->ltree);
253 free = 1;
254 }
255 spin_unlock(&ubi->ltree_lock);
256
257 up_read(&le->mutex);
258 if (free)
259 kmem_cache_free(ltree_slab, le);
260}
261
262/**
263 * leb_write_lock - lock logical eraseblock for writing.
264 * @ubi: UBI device description object
265 * @vol_id: volume ID
266 * @lnum: logical eraseblock number
267 *
268 * This function locks a logical eraseblock for writing. Returns zero in case
269 * of success and a negative error code in case of failure.
270 */
271static int leb_write_lock(struct ubi_device *ubi, int vol_id, int lnum)
272{
273 struct ltree_entry *le;
274
275 le = ltree_add_entry(ubi, vol_id, lnum);
276 if (IS_ERR(le))
277 return PTR_ERR(le);
278 down_write(&le->mutex);
279 return 0;
280}
281
282/**
283 * leb_write_unlock - unlock logical eraseblock.
284 * @ubi: UBI device description object
285 * @vol_id: volume ID
286 * @lnum: logical eraseblock number
287 */
288static void leb_write_unlock(struct ubi_device *ubi, int vol_id, int lnum)
289{
290 int free;
291 struct ltree_entry *le;
292
293 spin_lock(&ubi->ltree_lock);
294 le = ltree_lookup(ubi, vol_id, lnum);
295 le->users -= 1;
296 ubi_assert(le->users >= 0);
297 if (le->users == 0) {
298 rb_erase(&le->rb, &ubi->ltree);
299 free = 1;
300 } else
301 free = 0;
302 spin_unlock(&ubi->ltree_lock);
303
304 up_write(&le->mutex);
305 if (free)
306 kmem_cache_free(ltree_slab, le);
307}
308
309/**
310 * ubi_eba_unmap_leb - un-map logical eraseblock.
311 * @ubi: UBI device description object
312 * @vol_id: volume ID
313 * @lnum: logical eraseblock number
314 *
315 * This function un-maps logical eraseblock @lnum and schedules the corresponding
316 * physical eraseblock for erasure. Returns zero in case of success and a
317 * negative error code in case of failure.
318 */
319int ubi_eba_unmap_leb(struct ubi_device *ubi, int vol_id, int lnum)
320{
321 int idx = vol_id2idx(ubi, vol_id), err, pnum;
322 struct ubi_volume *vol = ubi->volumes[idx];
323
324 if (ubi->ro_mode)
325 return -EROFS;
326
327 err = leb_write_lock(ubi, vol_id, lnum);
328 if (err)
329 return err;
330
331 pnum = vol->eba_tbl[lnum];
332 if (pnum < 0)
333 /* This logical eraseblock is already unmapped */
334 goto out_unlock;
335
336 dbg_eba("erase LEB %d:%d, PEB %d", vol_id, lnum, pnum);
337
338 vol->eba_tbl[lnum] = UBI_LEB_UNMAPPED;
339 err = ubi_wl_put_peb(ubi, pnum, 0);
340
341out_unlock:
342 leb_write_unlock(ubi, vol_id, lnum);
343 return err;
344}
345
346/**
347 * ubi_eba_read_leb - read data.
348 * @ubi: UBI device description object
349 * @vol_id: volume ID
350 * @lnum: logical eraseblock number
351 * @buf: buffer to store the read data
352 * @offset: offset from where to read
353 * @len: how many bytes to read
354 * @check: data CRC check flag
355 *
356 * If the logical eraseblock @lnum is unmapped, @buf is filled with 0xFF
357 * bytes. The @check flag only makes sense for static volumes and forces
358 * eraseblock data CRC checking.
359 *
360 * In case of success this function returns zero. In case of a static volume,
361 * if the data CRC mismatches, %-EBADMSG is returned. %-EBADMSG may also be
362 * returned for any volume type if an ECC error was detected by the MTD device
363 * driver. Other negative error codes may be returned in case of other errors.
364 */
365int ubi_eba_read_leb(struct ubi_device *ubi, int vol_id, int lnum, void *buf,
366 int offset, int len, int check)
367{
368 int err, pnum, scrub = 0, idx = vol_id2idx(ubi, vol_id);
369 struct ubi_vid_hdr *vid_hdr;
370 struct ubi_volume *vol = ubi->volumes[idx];
371 uint32_t crc, crc1;
372
373 err = leb_read_lock(ubi, vol_id, lnum);
374 if (err)
375 return err;
376
377 pnum = vol->eba_tbl[lnum];
378 if (pnum < 0) {
379 /*
380 * The logical eraseblock is not mapped, fill the whole buffer
381 * with 0xFF bytes. The exception is static volumes for which
382 * it is an error to read unmapped logical eraseblocks.
383 */
384 dbg_eba("read %d bytes from offset %d of LEB %d:%d (unmapped)",
385 len, offset, vol_id, lnum);
386 leb_read_unlock(ubi, vol_id, lnum);
387 ubi_assert(vol->vol_type != UBI_STATIC_VOLUME);
388 memset(buf, 0xFF, len);
389 return 0;
390 }
391
392 dbg_eba("read %d bytes from offset %d of LEB %d:%d, PEB %d",
393 len, offset, vol_id, lnum, pnum);
394
395 if (vol->vol_type == UBI_DYNAMIC_VOLUME)
396 check = 0;
397
398retry:
399 if (check) {
400 vid_hdr = ubi_zalloc_vid_hdr(ubi);
401 if (!vid_hdr) {
402 err = -ENOMEM;
403 goto out_unlock;
404 }
405
406 err = ubi_io_read_vid_hdr(ubi, pnum, vid_hdr, 1);
407 if (err && err != UBI_IO_BITFLIPS) {
408 if (err > 0) {
409 /*
410 * The header is either absent or corrupted.
411 * The former case means there is a bug -
412 * switch to read-only mode just in case.
413 * The latter case means a real corruption - we
414 * may try to recover data. FIXME: but this is
415 * not implemented.
416 */
417 if (err == UBI_IO_BAD_VID_HDR) {
418 ubi_warn("bad VID header at PEB %d, LEB"
419 "%d:%d", pnum, vol_id, lnum);
420 err = -EBADMSG;
421 } else
422 ubi_ro_mode(ubi);
423 }
424 goto out_free;
425 } else if (err == UBI_IO_BITFLIPS)
426 scrub = 1;
427
428 ubi_assert(lnum < ubi32_to_cpu(vid_hdr->used_ebs));
429 ubi_assert(len == ubi32_to_cpu(vid_hdr->data_size));
430
431 crc = ubi32_to_cpu(vid_hdr->data_crc);
432 ubi_free_vid_hdr(ubi, vid_hdr);
433 }
434
435 err = ubi_io_read_data(ubi, buf, pnum, offset, len);
436 if (err) {
437 if (err == UBI_IO_BITFLIPS) {
438 scrub = 1;
439 err = 0;
440 } else if (err == -EBADMSG) {
441 if (vol->vol_type == UBI_DYNAMIC_VOLUME)
442 goto out_unlock;
443 scrub = 1;
444 if (!check) {
445 ubi_msg("force data checking");
446 check = 1;
447 goto retry;
448 }
449 } else
450 goto out_unlock;
451 }
452
453 if (check) {
454 crc1 = crc32(UBI_CRC32_INIT, buf, len);
455 if (crc1 != crc) {
456 ubi_warn("CRC error: calculated %#08x, must be %#08x",
457 crc1, crc);
458 err = -EBADMSG;
459 goto out_unlock;
460 }
461 }
462
463 if (scrub)
464 err = ubi_wl_scrub_peb(ubi, pnum);
465
466 leb_read_unlock(ubi, vol_id, lnum);
467 return err;
468
469out_free:
470 ubi_free_vid_hdr(ubi, vid_hdr);
471out_unlock:
472 leb_read_unlock(ubi, vol_id, lnum);
473 return err;
474}
475
476/**
477 * recover_peb - recover from write failure.
478 * @ubi: UBI device description object
479 * @pnum: the physical eraseblock to recover
480 * @vol_id: volume ID
481 * @lnum: logical eraseblock number
482 * @buf: data which was not written because of the write failure
483 * @offset: offset of the failed write
484 * @len: how many bytes should have been written
485 *
486 * This function is called in case of a write failure and moves all good data
487 * from the potentially bad physical eraseblock to a good physical eraseblock.
488 * This function also writes the data which was not written due to the failure.
489 * Returns new physical eraseblock number in case of success, and a negative
490 * error code in case of failure.
491 */
492static int recover_peb(struct ubi_device *ubi, int pnum, int vol_id, int lnum,
493 const void *buf, int offset, int len)
494{
495 int err, idx = vol_id2idx(ubi, vol_id), new_pnum, data_size, tries = 0;
496 struct ubi_volume *vol = ubi->volumes[idx];
497 struct ubi_vid_hdr *vid_hdr;
498 unsigned char *new_buf;
499
500 vid_hdr = ubi_zalloc_vid_hdr(ubi);
501 if (!vid_hdr) {
502 return -ENOMEM;
503 }
504
505retry:
506 new_pnum = ubi_wl_get_peb(ubi, UBI_UNKNOWN);
507 if (new_pnum < 0) {
508 ubi_free_vid_hdr(ubi, vid_hdr);
509 return new_pnum;
510 }
511
512 ubi_msg("recover PEB %d, move data to PEB %d", pnum, new_pnum);
513
514 err = ubi_io_read_vid_hdr(ubi, pnum, vid_hdr, 1);
515 if (err && err != UBI_IO_BITFLIPS) {
516 if (err > 0)
517 err = -EIO;
518 goto out_put;
519 }
520
521 vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi));
522 err = ubi_io_write_vid_hdr(ubi, new_pnum, vid_hdr);
523 if (err)
524 goto write_error;
525
526 data_size = offset + len;
527 new_buf = kmalloc(data_size, GFP_KERNEL);
528 if (!new_buf) {
529 err = -ENOMEM;
530 goto out_put;
531 }
532 memset(new_buf + offset, 0xFF, len);
533
534 /* Read everything before the area where the write failure happened */
535 if (offset > 0) {
536 err = ubi_io_read_data(ubi, new_buf, pnum, 0, offset);
537 if (err && err != UBI_IO_BITFLIPS) {
538 kfree(new_buf);
539 goto out_put;
540 }
541 }
542
543 memcpy(new_buf + offset, buf, len);
544
545 err = ubi_io_write_data(ubi, new_buf, new_pnum, 0, data_size);
546 if (err) {
547 kfree(new_buf);
548 goto write_error;
549 }
550
551 kfree(new_buf);
552 ubi_free_vid_hdr(ubi, vid_hdr);
553
554 vol->eba_tbl[lnum] = new_pnum;
555 ubi_wl_put_peb(ubi, pnum, 1);
556
557 ubi_msg("data was successfully recovered");
558 return 0;
559
560out_put:
561 ubi_wl_put_peb(ubi, new_pnum, 1);
562 ubi_free_vid_hdr(ubi, vid_hdr);
563 return err;
564
565write_error:
566 /*
567 * Bad luck? This physical eraseblock is bad too? Crud. Let's try to
568 * get another one.
569 */
570 ubi_warn("failed to write to PEB %d", new_pnum);
571 ubi_wl_put_peb(ubi, new_pnum, 1);
572 if (++tries > UBI_IO_RETRIES) {
573 ubi_free_vid_hdr(ubi, vid_hdr);
574 return err;
575 }
576 ubi_msg("try again");
577 goto retry;
578}
579
580/**
581 * ubi_eba_write_leb - write data to dynamic volume.
582 * @ubi: UBI device description object
583 * @vol_id: volume ID
584 * @lnum: logical eraseblock number
585 * @buf: the data to write
586 * @offset: offset within the logical eraseblock where to write
587 * @len: how many bytes to write
588 * @dtype: data type
589 *
590 * This function writes data to logical eraseblock @lnum of a dynamic volume
591 * @vol_id. Returns zero in case of success and a negative error code in case
592 * of failure. In case of error, it is possible that something was still
593 * written to the flash media, but it may be garbage.
594 */
595int ubi_eba_write_leb(struct ubi_device *ubi, int vol_id, int lnum,
596 const void *buf, int offset, int len, int dtype)
597{
598 int idx = vol_id2idx(ubi, vol_id), err, pnum, tries = 0;
599 struct ubi_volume *vol = ubi->volumes[idx];
600 struct ubi_vid_hdr *vid_hdr;
601
602 if (ubi->ro_mode)
603 return -EROFS;
604
605 err = leb_write_lock(ubi, vol_id, lnum);
606 if (err)
607 return err;
608
609 pnum = vol->eba_tbl[lnum];
610 if (pnum >= 0) {
611 dbg_eba("write %d bytes at offset %d of LEB %d:%d, PEB %d",
612 len, offset, vol_id, lnum, pnum);
613
614 err = ubi_io_write_data(ubi, buf, pnum, offset, len);
615 if (err) {
616 ubi_warn("failed to write data to PEB %d", pnum);
617 if (err == -EIO && ubi->bad_allowed)
618 err = recover_peb(ubi, pnum, vol_id, lnum, buf, offset, len);
619 if (err)
620 ubi_ro_mode(ubi);
621 }
622 leb_write_unlock(ubi, vol_id, lnum);
623 return err;
624 }
625
626 /*
627 * The logical eraseblock is not mapped. We have to get a free physical
628 * eraseblock and write the volume identifier header there first.
629 */
630 vid_hdr = ubi_zalloc_vid_hdr(ubi);
631 if (!vid_hdr) {
632 leb_write_unlock(ubi, vol_id, lnum);
633 return -ENOMEM;
634 }
635
636 vid_hdr->vol_type = UBI_VID_DYNAMIC;
637 vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi));
638 vid_hdr->vol_id = cpu_to_ubi32(vol_id);
639 vid_hdr->lnum = cpu_to_ubi32(lnum);
640 vid_hdr->compat = ubi_get_compat(ubi, vol_id);
641 vid_hdr->data_pad = cpu_to_ubi32(vol->data_pad);
642
643retry:
644 pnum = ubi_wl_get_peb(ubi, dtype);
645 if (pnum < 0) {
646 ubi_free_vid_hdr(ubi, vid_hdr);
647 leb_write_unlock(ubi, vol_id, lnum);
648 return pnum;
649 }
650
651 dbg_eba("write VID hdr and %d bytes at offset %d of LEB %d:%d, PEB %d",
652 len, offset, vol_id, lnum, pnum);
653
654 err = ubi_io_write_vid_hdr(ubi, pnum, vid_hdr);
655 if (err) {
656 ubi_warn("failed to write VID header to LEB %d:%d, PEB %d",
657 vol_id, lnum, pnum);
658 goto write_error;
659 }
660
661 err = ubi_io_write_data(ubi, buf, pnum, offset, len);
662 if (err) {
663 ubi_warn("failed to write %d bytes at offset %d of LEB %d:%d, "
664 "PEB %d", len, offset, vol_id, lnum, pnum);
665 goto write_error;
666 }
667
668 vol->eba_tbl[lnum] = pnum;
669
670 leb_write_unlock(ubi, vol_id, lnum);
671 ubi_free_vid_hdr(ubi, vid_hdr);
672 return 0;
673
674write_error:
675 if (err != -EIO || !ubi->bad_allowed) {
676 ubi_ro_mode(ubi);
677 leb_write_unlock(ubi, vol_id, lnum);
678 ubi_free_vid_hdr(ubi, vid_hdr);
679 return err;
680 }
681
682 /*
683 * Fortunately, this is the first write operation to this physical
684 * eraseblock, so just put it and request a new one. We assume that if
685 * this physical eraseblock went bad, the erase code will handle that.
686 */
687 err = ubi_wl_put_peb(ubi, pnum, 1);
688 if (err || ++tries > UBI_IO_RETRIES) {
689 ubi_ro_mode(ubi);
690 leb_write_unlock(ubi, vol_id, lnum);
691 ubi_free_vid_hdr(ubi, vid_hdr);
692 return err;
693 }
694
695 vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi));
696 ubi_msg("try another PEB");
697 goto retry;
698}
699
700/**
701 * ubi_eba_write_leb_st - write data to static volume.
702 * @ubi: UBI device description object
703 * @vol_id: volume ID
704 * @lnum: logical eraseblock number
705 * @buf: data to write
706 * @len: how many bytes to write
707 * @dtype: data type
708 * @used_ebs: how many logical eraseblocks will this volume contain
709 *
710 * This function writes data to logical eraseblock @lnum of static volume
711 * @vol_id. The @used_ebs argument should contain the total number of logical
712 * eraseblocks in this static volume.
713 *
714 * When writing to the last logical eraseblock, the @len argument doesn't have
715 * to be aligned to the minimal I/O unit size. Instead, it has to be equal to
716 * the real data size, although the @buf buffer still has to be padded out to
717 * the aligned length. In all other cases, @len has to be aligned.
718 *
719 * It is prohibited to write more than once to logical eraseblocks of static
720 * volumes. This function returns zero in case of success and a negative error
721 * code in case of failure.
722 */
723int ubi_eba_write_leb_st(struct ubi_device *ubi, int vol_id, int lnum,
724 const void *buf, int len, int dtype, int used_ebs)
725{
726 int err, pnum, tries = 0, data_size = len;
727 int idx = vol_id2idx(ubi, vol_id);
728 struct ubi_volume *vol = ubi->volumes[idx];
729 struct ubi_vid_hdr *vid_hdr;
730 uint32_t crc;
731
732 if (ubi->ro_mode)
733 return -EROFS;
734
735 if (lnum == used_ebs - 1)
736 /* If this is the last LEB @len may be unaligned */
737 len = ALIGN(data_size, ubi->min_io_size);
738 else
739 ubi_assert(len % ubi->min_io_size == 0);
740
741 vid_hdr = ubi_zalloc_vid_hdr(ubi);
742 if (!vid_hdr)
743 return -ENOMEM;
744
745 err = leb_write_lock(ubi, vol_id, lnum);
746 if (err) {
747 ubi_free_vid_hdr(ubi, vid_hdr);
748 return err;
749 }
750
751 vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi));
752 vid_hdr->vol_id = cpu_to_ubi32(vol_id);
753 vid_hdr->lnum = cpu_to_ubi32(lnum);
754 vid_hdr->compat = ubi_get_compat(ubi, vol_id);
755 vid_hdr->data_pad = cpu_to_ubi32(vol->data_pad);
756
757 crc = crc32(UBI_CRC32_INIT, buf, data_size);
758 vid_hdr->vol_type = UBI_VID_STATIC;
759 vid_hdr->data_size = cpu_to_ubi32(data_size);
760 vid_hdr->used_ebs = cpu_to_ubi32(used_ebs);
761 vid_hdr->data_crc = cpu_to_ubi32(crc);
762
763retry:
764 pnum = ubi_wl_get_peb(ubi, dtype);
765 if (pnum < 0) {
766 ubi_free_vid_hdr(ubi, vid_hdr);
767 leb_write_unlock(ubi, vol_id, lnum);
768 return pnum;
769 }
770
771 dbg_eba("write VID hdr and %d bytes at LEB %d:%d, PEB %d, used_ebs %d",
772 len, vol_id, lnum, pnum, used_ebs);
773
774 err = ubi_io_write_vid_hdr(ubi, pnum, vid_hdr);
775 if (err) {
776 ubi_warn("failed to write VID header to LEB %d:%d, PEB %d",
777 vol_id, lnum, pnum);
778 goto write_error;
779 }
780
781 err = ubi_io_write_data(ubi, buf, pnum, 0, len);
782 if (err) {
783 ubi_warn("failed to write %d bytes of data to PEB %d",
784 len, pnum);
785 goto write_error;
786 }
787
788 ubi_assert(vol->eba_tbl[lnum] < 0);
789 vol->eba_tbl[lnum] = pnum;
790
791 leb_write_unlock(ubi, vol_id, lnum);
792 ubi_free_vid_hdr(ubi, vid_hdr);
793 return 0;
794
795write_error:
796 if (err != -EIO || !ubi->bad_allowed) {
797 /*
798 * This flash device does not admit of bad eraseblocks, or
799 * something nasty and unexpected happened. Switch to read-only
800 * mode just in case.
801 */
802 ubi_ro_mode(ubi);
803 leb_write_unlock(ubi, vol_id, lnum);
804 ubi_free_vid_hdr(ubi, vid_hdr);
805 return err;
806 }
807
808 err = ubi_wl_put_peb(ubi, pnum, 1);
809 if (err || ++tries > UBI_IO_RETRIES) {
810 ubi_ro_mode(ubi);
811 leb_write_unlock(ubi, vol_id, lnum);
812 ubi_free_vid_hdr(ubi, vid_hdr);
813 return err;
814 }
815
816 vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi));
817 ubi_msg("try another PEB");
818 goto retry;
819}
820
821/**
822 * ubi_eba_atomic_leb_change - change logical eraseblock atomically.
823 * @ubi: UBI device description object
824 * @vol_id: volume ID
825 * @lnum: logical eraseblock number
826 * @buf: data to write
827 * @len: how many bytes to write
828 * @dtype: data type
829 *
830 * This function changes the contents of a logical eraseblock atomically. @buf
831 * has to contain new logical eraseblock data, and @len - the length of the
832 * data, which has to be aligned. This function guarantees that in case of an
833 * unclean reboot the old contents are preserved. Returns zero in case of
834 * success and a negative error code in case of failure.
835 */
836int ubi_eba_atomic_leb_change(struct ubi_device *ubi, int vol_id, int lnum,
837 const void *buf, int len, int dtype)
838{
839 int err, pnum, tries = 0, idx = vol_id2idx(ubi, vol_id);
840 struct ubi_volume *vol = ubi->volumes[idx];
841 struct ubi_vid_hdr *vid_hdr;
842 uint32_t crc;
843
844 if (ubi->ro_mode)
845 return -EROFS;
846
847 vid_hdr = ubi_zalloc_vid_hdr(ubi);
848 if (!vid_hdr)
849 return -ENOMEM;
850
851 err = leb_write_lock(ubi, vol_id, lnum);
852 if (err) {
853 ubi_free_vid_hdr(ubi, vid_hdr);
854 return err;
855 }
856
857 vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi));
858 vid_hdr->vol_id = cpu_to_ubi32(vol_id);
859 vid_hdr->lnum = cpu_to_ubi32(lnum);
860 vid_hdr->compat = ubi_get_compat(ubi, vol_id);
861 vid_hdr->data_pad = cpu_to_ubi32(vol->data_pad);
862
863 crc = crc32(UBI_CRC32_INIT, buf, len);
864 vid_hdr->vol_type = UBI_VID_STATIC;
865 vid_hdr->data_size = cpu_to_ubi32(len);
866 vid_hdr->copy_flag = 1;
867 vid_hdr->data_crc = cpu_to_ubi32(crc);
868
869retry:
870 pnum = ubi_wl_get_peb(ubi, dtype);
871 if (pnum < 0) {
872 ubi_free_vid_hdr(ubi, vid_hdr);
873 leb_write_unlock(ubi, vol_id, lnum);
874 return pnum;
875 }
876
877 dbg_eba("change LEB %d:%d, PEB %d, write VID hdr to PEB %d",
878 vol_id, lnum, vol->eba_tbl[lnum], pnum);
879
880 err = ubi_io_write_vid_hdr(ubi, pnum, vid_hdr);
881 if (err) {
882 ubi_warn("failed to write VID header to LEB %d:%d, PEB %d",
883 vol_id, lnum, pnum);
884 goto write_error;
885 }
886
887 err = ubi_io_write_data(ubi, buf, pnum, 0, len);
888 if (err) {
889 ubi_warn("failed to write %d bytes of data to PEB %d",
890 len, pnum);
891 goto write_error;
892 }
893
894 err = ubi_wl_put_peb(ubi, vol->eba_tbl[lnum], 1);
895 if (err) {
896 ubi_free_vid_hdr(ubi, vid_hdr);
897 leb_write_unlock(ubi, vol_id, lnum);
898 return err;
899 }
900
901 vol->eba_tbl[lnum] = pnum;
902 leb_write_unlock(ubi, vol_id, lnum);
903 ubi_free_vid_hdr(ubi, vid_hdr);
904 return 0;
905
906write_error:
907 if (err != -EIO || !ubi->bad_allowed) {
908 /*
909 * This flash device does not admit of bad eraseblocks, or
910 * something nasty and unexpected happened. Switch to read-only
911 * mode just in case.
912 */
913 ubi_ro_mode(ubi);
914 leb_write_unlock(ubi, vol_id, lnum);
915 ubi_free_vid_hdr(ubi, vid_hdr);
916 return err;
917 }
918
919 err = ubi_wl_put_peb(ubi, pnum, 1);
920 if (err || ++tries > UBI_IO_RETRIES) {
921 ubi_ro_mode(ubi);
922 leb_write_unlock(ubi, vol_id, lnum);
923 ubi_free_vid_hdr(ubi, vid_hdr);
924 return err;
925 }
926
927 vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi));
928 ubi_msg("try another PEB");
929 goto retry;
930}
931
932/**
933 * ltree_entry_ctor - lock tree entries slab cache constructor.
934 * @obj: the lock-tree entry to construct
935 * @cache: the lock tree entry slab cache
936 * @flags: constructor flags
937 */
938static void ltree_entry_ctor(void *obj, struct kmem_cache *cache,
939 unsigned long flags)
940{
941 struct ltree_entry *le = obj;
942
943 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) !=
944 SLAB_CTOR_CONSTRUCTOR)
945 return;
946
947 le->users = 0;
948 init_rwsem(&le->mutex);
949}
950
951/**
952 * ubi_eba_copy_leb - copy logical eraseblock.
953 * @ubi: UBI device description object
954 * @from: physical eraseblock number from where to copy
955 * @to: physical eraseblock number where to copy
956 * @vid_hdr: VID header of the @from physical eraseblock
957 *
958 * This function copies logical eraseblock from physical eraseblock @from to
959 * physical eraseblock @to. The @vid_hdr buffer may be changed by this
960 * function. Returns zero in case of success, %UBI_IO_BITFLIPS if the operation
961 * was canceled because bit-flips were detected at the target PEB, and a
962 * negative error code in case of failure.
963 */
964int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to,
965 struct ubi_vid_hdr *vid_hdr)
966{
967 int err, vol_id, lnum, data_size, aldata_size, pnum, idx;
968 struct ubi_volume *vol;
969 uint32_t crc;
970 void *buf, *buf1 = NULL;
971
972 vol_id = ubi32_to_cpu(vid_hdr->vol_id);
973 lnum = ubi32_to_cpu(vid_hdr->lnum);
974
975 dbg_eba("copy LEB %d:%d, PEB %d to PEB %d", vol_id, lnum, from, to);
976
977 if (vid_hdr->vol_type == UBI_VID_STATIC) {
978 data_size = ubi32_to_cpu(vid_hdr->data_size);
979 aldata_size = ALIGN(data_size, ubi->min_io_size);
980 } else
981 data_size = aldata_size =
982 ubi->leb_size - ubi32_to_cpu(vid_hdr->data_pad);
983
984 buf = kmalloc(aldata_size, GFP_KERNEL);
985 if (!buf)
986 return -ENOMEM;
987
988 /*
989 * We do not want anybody to write to this logical eraseblock while we
990 * are moving it, so we lock it.
991 */
992 err = leb_write_lock(ubi, vol_id, lnum);
993 if (err) {
994 kfree(buf);
995 return err;
996 }
997
998 /*
999 * But the logical eraseblock might have been put by this time.
1000 * If so, cancel the operation.
1001 */
1002 idx = vol_id2idx(ubi, vol_id);
1003
1004 /*
1005 * We may race with volume deletion/re-size, so we have to hold
1006 * @ubi->volumes_lock.
1007 */
1008 spin_lock(&ubi->volumes_lock);
1009 vol = ubi->volumes[idx];
1010 if (!vol) {
1011 dbg_eba("volume %d was removed meanwhile", vol_id);
1012 spin_unlock(&ubi->volumes_lock);
1013 goto out_unlock;
1014 }
1015
1016 pnum = vol->eba_tbl[lnum];
1017 if (pnum != from) {
1018 dbg_eba("LEB %d:%d is no longer mapped to PEB %d, mapped to "
1019 "PEB %d, cancel", vol_id, lnum, from, pnum);
1020 spin_unlock(&ubi->volumes_lock);
1021 goto out_unlock;
1022 }
1023 spin_unlock(&ubi->volumes_lock);
1024
1025 /* OK, now the LEB is locked and we can safely start moving it */
1026
1027 dbg_eba("read %d bytes of data", aldata_size);
1028 err = ubi_io_read_data(ubi, buf, from, 0, aldata_size);
1029 if (err && err != UBI_IO_BITFLIPS) {
1030 ubi_warn("error %d while reading data from PEB %d",
1031 err, from);
1032 goto out_unlock;
1033 }
1034
1035 /*
1036 * Now we have to calculate how much data we have to copy. In case of
1037 * a static volume it is fairly easy: the VID header contains the data
1038 * size. In case of a dynamic volume it is more difficult: we have to
1039 * read the contents, cut the trailing 0xFF bytes, and copy only the
1040 * first part. We must do this to avoid writing 0xFF bytes, as that
1041 * may have side-effects. Moreover, it is important not to include
1042 * those 0xFFs in the CRC because they may later be overwritten
1043 * by data.
1044 */
1045 if (vid_hdr->vol_type == UBI_VID_DYNAMIC)
1046 aldata_size = data_size =
1047 ubi_calc_data_len(ubi, buf, data_size);
1048
1049 cond_resched();
1050 crc = crc32(UBI_CRC32_INIT, buf, data_size);
1051 cond_resched();
1052
1053 /*
1054 * It may turn out to be that the whole @from physical eraseblock
1055 * contains only 0xFF bytes. Then we only have to write the VID header
1056 * and no data at all. This also means we should not set
1057 * @vid_hdr->copy_flag, @vid_hdr->data_size, and @vid_hdr->data_crc.
1058 */
1059 if (data_size > 0) {
1060 vid_hdr->copy_flag = 1;
1061 vid_hdr->data_size = cpu_to_ubi32(data_size);
1062 vid_hdr->data_crc = cpu_to_ubi32(crc);
1063 }
1064 vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi));
1065
1066 err = ubi_io_write_vid_hdr(ubi, to, vid_hdr);
1067 if (err)
1068 goto out_unlock;
1069
1070 cond_resched();
1071
1072 /* Read the VID header back and check if it was written correctly */
1073 err = ubi_io_read_vid_hdr(ubi, to, vid_hdr, 1);
1074 if (err) {
1075 if (err != UBI_IO_BITFLIPS)
1076 ubi_warn("cannot read VID header back from PEB %d", to);
1077 goto out_unlock;
1078 }
1079
1080 if (data_size > 0) {
1081 err = ubi_io_write_data(ubi, buf, to, 0, aldata_size);
1082 if (err)
1083 goto out_unlock;
1084
1085 /*
1086 * We've written the data and are going to read it back to make
1087 * sure it was written correctly.
1088 */
1089 buf1 = kmalloc(aldata_size, GFP_KERNEL);
1090 if (!buf1) {
1091 err = -ENOMEM;
1092 goto out_unlock;
1093 }
1094
1095 cond_resched();
1096
1097 err = ubi_io_read_data(ubi, buf1, to, 0, aldata_size);
1098 if (err) {
1099 if (err != UBI_IO_BITFLIPS)
1100 ubi_warn("cannot read data back from PEB %d",
1101 to);
1102 goto out_unlock;
1103 }
1104
1105 cond_resched();
1106
1107 if (memcmp(buf, buf1, aldata_size)) {
1108 ubi_warn("read data back from PEB %d - it is different", to);
1109 err = -EINVAL;
1110 goto out_unlock;
1111 }
1112 }
1113
1114 ubi_assert(vol->eba_tbl[lnum] == from);
1115 vol->eba_tbl[lnum] = to;
1116
1117 leb_write_unlock(ubi, vol_id, lnum);
1118 kfree(buf);
1119 kfree(buf1);
1120
1121 return 0;
1122
1123out_unlock:
1124 leb_write_unlock(ubi, vol_id, lnum);
1125 kfree(buf);
1126 kfree(buf1);
1127 return err;
1128}
1129
1130/**
1131 * ubi_eba_init_scan - initialize the EBA unit using scanning information.
1132 * @ubi: UBI device description object
1133 * @si: scanning information
1134 *
1135 * This function returns zero in case of success and a negative error code in
1136 * case of failure.
1137 */
1138int ubi_eba_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si)
1139{
1140 int i, j, err, num_volumes;
1141 struct ubi_scan_volume *sv;
1142 struct ubi_volume *vol;
1143 struct ubi_scan_leb *seb;
1144 struct rb_node *rb;
1145
1146 dbg_eba("initialize EBA unit");
1147
1148 spin_lock_init(&ubi->ltree_lock);
1149 ubi->ltree = RB_ROOT;
1150
1151 if (ubi_devices_cnt == 0) {
1152 ltree_slab = kmem_cache_create("ubi_ltree_slab",
1153 sizeof(struct ltree_entry), 0,
1154 0, &ltree_entry_ctor, NULL);
1155 if (!ltree_slab)
1156 return -ENOMEM;
1157 }
1158
1159 ubi->global_sqnum = si->max_sqnum + 1;
1160 num_volumes = ubi->vtbl_slots + UBI_INT_VOL_COUNT;
1161
1162 for (i = 0; i < num_volumes; i++) {
1163 vol = ubi->volumes[i];
1164 if (!vol)
1165 continue;
1166
1167 cond_resched();
1168
1169 vol->eba_tbl = kmalloc(vol->reserved_pebs * sizeof(int),
1170 GFP_KERNEL);
1171 if (!vol->eba_tbl) {
1172 err = -ENOMEM;
1173 goto out_free;
1174 }
1175
1176 for (j = 0; j < vol->reserved_pebs; j++)
1177 vol->eba_tbl[j] = UBI_LEB_UNMAPPED;
1178
1179 sv = ubi_scan_find_sv(si, idx2vol_id(ubi, i));
1180 if (!sv)
1181 continue;
1182
1183 ubi_rb_for_each_entry(rb, seb, &sv->root, u.rb) {
1184 if (seb->lnum >= vol->reserved_pebs)
1185 /*
1186 * This may happen in case of an unclean reboot
1187 * during re-size.
1188 */
1189 ubi_scan_move_to_list(sv, seb, &si->erase);
1190 vol->eba_tbl[seb->lnum] = seb->pnum;
1191 }
1192 }
1193
1194 if (ubi->bad_allowed) {
1195 ubi_calculate_reserved(ubi);
1196
1197 if (ubi->avail_pebs < ubi->beb_rsvd_level) {
1198 /* Not enough free physical eraseblocks */
1199 ubi->beb_rsvd_pebs = ubi->avail_pebs;
1200 ubi_warn("cannot reserve enough PEBs for bad PEB "
1201 "handling, reserved %d, need %d",
1202 ubi->beb_rsvd_pebs, ubi->beb_rsvd_level);
1203 } else
1204 ubi->beb_rsvd_pebs = ubi->beb_rsvd_level;
1205
1206 ubi->avail_pebs -= ubi->beb_rsvd_pebs;
1207 ubi->rsvd_pebs += ubi->beb_rsvd_pebs;
1208 }
1209
1210 dbg_eba("EBA unit is initialized");
1211 return 0;
1212
1213out_free:
1214 for (i = 0; i < num_volumes; i++) {
1215 if (!ubi->volumes[i])
1216 continue;
1217 kfree(ubi->volumes[i]->eba_tbl);
1218 }
1219 if (ubi_devices_cnt == 0)
1220 kmem_cache_destroy(ltree_slab);
1221 return err;
1222}
1223
1224/**
1225 * ubi_eba_close - close EBA unit.
1226 * @ubi: UBI device description object
1227 */
1228void ubi_eba_close(const struct ubi_device *ubi)
1229{
1230 int i, num_volumes = ubi->vtbl_slots + UBI_INT_VOL_COUNT;
1231
1232 dbg_eba("close EBA unit");
1233
1234 for (i = 0; i < num_volumes; i++) {
1235 if (!ubi->volumes[i])
1236 continue;
1237 kfree(ubi->volumes[i]->eba_tbl);
1238 }
1239 if (ubi_devices_cnt == 1)
1240 kmem_cache_destroy(ltree_slab);
1241}
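
The lock tree above keys its entries by (vol_id, lnum) pairs: both ltree_lookup() and ltree_add_entry() walk the RB-tree comparing volume IDs first and falling back to LEB numbers on a tie. The standalone sketch below spells out that two-level ordering; ltree_key_cmp() is a name invented for the illustration, as the kernel code open-codes the comparison inside the tree walks:

    #include <stdio.h>

    /* Two-level ordering over (vol_id, lnum), as used by the lock tree. */
    static int ltree_key_cmp(int vol_id_a, int lnum_a, int vol_id_b, int lnum_b)
    {
            if (vol_id_a != vol_id_b)
                    return vol_id_a < vol_id_b ? -1 : 1;
            if (lnum_a != lnum_b)
                    return lnum_a < lnum_b ? -1 : 1;
            return 0;       /* the same logical eraseblock */
    }

    int main(void)
    {
            printf("%d\n", ltree_key_cmp(1, 5, 1, 7));  /* -1: left subtree */
            printf("%d\n", ltree_key_cmp(2, 0, 1, 9));  /*  1: right subtree */
            printf("%d\n", ltree_key_cmp(3, 3, 3, 3));  /*  0: entry found */
            return 0;
    }
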
diff --git a/drivers/mtd/ubi/gluebi.c b/drivers/mtd/ubi/gluebi.c
new file mode 100644
index 000000000000..fc9478d605ff
--- /dev/null
+++ b/drivers/mtd/ubi/gluebi.c
@@ -0,0 +1,323 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём), Joern Engel
19 */
20
21/*
22 * This file includes implementation of fake MTD devices for each UBI volume.
23 * This sounds strange, but it is in fact quite useful to make MTD-oriented
24 * software (including all the legacy software) work on top of UBI.
25 *
26 * Gluebi emulates MTD devices of "MTD_UBIVOLUME" type. Their minimal I/O unit
27 * size (mtd->writesize) is equivalent to the UBI minimal I/O unit. The
28 * eraseblock size is equivalent to the logical eraseblock size of the volume.
29 */
30
31#include <asm/div64.h>
32#include "ubi.h"
33
34/**
35 * gluebi_get_device - get MTD device reference.
36 * @mtd: the MTD device description object
37 *
38 * This function is called every time the MTD device is being opened and
39 * implements the MTD get_device() operation. Returns zero in case of success
40 * and a negative error code in case of failure.
41 */
42static int gluebi_get_device(struct mtd_info *mtd)
43{
44 struct ubi_volume *vol;
45
46 vol = container_of(mtd, struct ubi_volume, gluebi_mtd);
47
48 /*
49 * We do not introduce locks for gluebi reference count because the
50 * get_device()/put_device() calls are already serialized at MTD.
51 */
52 if (vol->gluebi_refcount > 0) {
53 /*
54 * The MTD device is already referenced and this is just one
55 * more reference. MTD allows many users to open the same
56 * volume simultaneously and does not distinguish between
57 * readers/writers/exclusive openers as UBI does. So we do not
58 * open the UBI volume again - just increase the reference
59 * counter and return.
60 */
61 vol->gluebi_refcount += 1;
62 return 0;
63 }
64
65 /*
66 * This is the first reference to this UBI volume via the MTD device
67 * interface. Open the corresponding volume in read-write mode.
68 */
69 vol->gluebi_desc = ubi_open_volume(vol->ubi->ubi_num, vol->vol_id,
70 UBI_READWRITE);
71 if (IS_ERR(vol->gluebi_desc))
72 return PTR_ERR(vol->gluebi_desc);
73 vol->gluebi_refcount += 1;
74 return 0;
75}
76
77/**
78 * gluebi_put_device - put MTD device reference.
79 * @mtd: the MTD device description object
80 *
81 * This function is called every time the MTD device is being put. It
82 * decreases the reference count and closes the volume on the last put.
83 */
84static void gluebi_put_device(struct mtd_info *mtd)
85{
86 struct ubi_volume *vol;
87
88 vol = container_of(mtd, struct ubi_volume, gluebi_mtd);
89 vol->gluebi_refcount -= 1;
90 ubi_assert(vol->gluebi_refcount >= 0);
91 if (vol->gluebi_refcount == 0)
92 ubi_close_volume(vol->gluebi_desc);
93}
94
95/**
96 * gluebi_read - read operation of emulated MTD devices.
97 * @mtd: MTD device description object
98 * @from: absolute offset from where to read
99 * @len: how many bytes to read
100 * @retlen: count of read bytes is returned here
101 * @buf: buffer to store the read data
102 *
103 * This function returns zero in case of success and a negative error code in
104 * case of failure.
105 */
106static int gluebi_read(struct mtd_info *mtd, loff_t from, size_t len,
107 size_t *retlen, unsigned char *buf)
108{
109 int err = 0, lnum, offs, total_read;
110 struct ubi_volume *vol;
111 struct ubi_device *ubi;
112 uint64_t tmp = from;
113
114 dbg_msg("read %zd bytes from offset %lld", len, from);
115
116 if (len < 0 || from < 0 || from + len > mtd->size)
117 return -EINVAL;
118
119 vol = container_of(mtd, struct ubi_volume, gluebi_mtd);
120 ubi = vol->ubi;
121
122 offs = do_div(tmp, mtd->erasesize);
123 lnum = tmp;
124
125 total_read = len;
126 while (total_read) {
127 size_t to_read = mtd->erasesize - offs;
128
129 if (to_read > total_read)
130 to_read = total_read;
131
132 err = ubi_eba_read_leb(ubi, vol->vol_id, lnum, buf, offs,
133 to_read, 0);
134 if (err)
135 break;
136
137 lnum += 1;
138 offs = 0;
139 total_read -= to_read;
140 buf += to_read;
141 }
142
143 *retlen = len - total_read;
144 return err;
145}
146
147/**
148 * gluebi_write - write operation of emulated MTD devices.
149 * @mtd: MTD device description object
150 * @to: absolute offset where to write
151 * @len: how many bytes to write
152 * @retlen: count of written bytes is returned here
153 * @buf: buffer with data to write
154 *
155 * This function returns zero in case of success and a negative error code in
156 * case of failure.
157 */
158static int gluebi_write(struct mtd_info *mtd, loff_t to, size_t len,
159 size_t *retlen, const u_char *buf)
160{
161 int err = 0, lnum, offs, total_written;
162 struct ubi_volume *vol;
163 struct ubi_device *ubi;
164 uint64_t tmp = to;
165
166 dbg_msg("write %zd bytes to offset %lld", len, to);
167
168 if (len < 0 || to < 0 || len + to > mtd->size)
169 return -EINVAL;
170
171 vol = container_of(mtd, struct ubi_volume, gluebi_mtd);
172 ubi = vol->ubi;
173
174 if (ubi->ro_mode)
175 return -EROFS;
176
177 offs = do_div(tmp, mtd->erasesize);
178 lnum = tmp;
179
180 if (len % mtd->writesize || offs % mtd->writesize)
181 return -EINVAL;
182
183 total_written = len;
184 while (total_written) {
185 size_t to_write = mtd->erasesize - offs;
186
187 if (to_write > total_written)
188 to_write = total_written;
189
190 err = ubi_eba_write_leb(ubi, vol->vol_id, lnum, buf, offs,
191 to_write, UBI_UNKNOWN);
192 if (err)
193 break;
194
195 lnum += 1;
196 offs = 0;
197 total_written -= to_write;
198 buf += to_write;
199 }
200
201 *retlen = len - total_written;
202 return err;
203}
204
205/**
206 * gluebi_erase - erase operation of emulated MTD devices.
207 * @mtd: the MTD device description object
208 * @instr: the erase operation description
209 *
210 * This function calls the erase callback when it finishes. Returns zero in case
211 * of success and a negative error code in case of failure.
212 */
213static int gluebi_erase(struct mtd_info *mtd, struct erase_info *instr)
214{
215 int err, i, lnum, count;
216 struct ubi_volume *vol;
217 struct ubi_device *ubi;
218
219 dbg_msg("erase %u bytes at offset %u", instr->len, instr->addr);
220
221 if (instr->addr < 0 || instr->addr > mtd->size - mtd->erasesize)
222 return -EINVAL;
223
224 if (instr->len < 0 || instr->addr + instr->len > mtd->size)
225 return -EINVAL;
226
227 if (instr->addr % mtd->writesize || instr->len % mtd->writesize)
228 return -EINVAL;
229
230 lnum = instr->addr / mtd->erasesize;
231 count = instr->len / mtd->erasesize;
232
233 vol = container_of(mtd, struct ubi_volume, gluebi_mtd);
234 ubi = vol->ubi;
235
236 if (ubi->ro_mode)
237 return -EROFS;
238
239 for (i = 0; i < count; i++) {
240 err = ubi_eba_unmap_leb(ubi, vol->vol_id, lnum + i);
241 if (err)
242 goto out_err;
243 }
244
245 /*
246 * MTD erase operations are synchronous, so we have to make sure the
247 * physical eraseblock is wiped out.
248 */
249 err = ubi_wl_flush(ubi);
250 if (err)
251 goto out_err;
252
253 instr->state = MTD_ERASE_DONE;
254 mtd_erase_callback(instr);
255 return 0;
256
257out_err:
258 instr->state = MTD_ERASE_FAILED;
259 instr->fail_addr = lnum * mtd->erasesize;
260 return err;
261}
262
263/**
264 * ubi_create_gluebi - initialize gluebi for an UBI volume.
265 * @ubi: UBI device description object
266 * @vol: volume description object
267 *
268 * This function is called when an UBI volume is created in order to create
269 * the corresponding fake MTD device. Returns zero in case of success and a
270 * negative error code in case of failure.
271 */
272int ubi_create_gluebi(struct ubi_device *ubi, struct ubi_volume *vol)
273{
274 struct mtd_info *mtd = &vol->gluebi_mtd;
275
276 mtd->name = kmemdup(vol->name, vol->name_len + 1, GFP_KERNEL);
277 if (!mtd->name)
278 return -ENOMEM;
279
280 mtd->type = MTD_UBIVOLUME;
281 if (!ubi->ro_mode)
282 mtd->flags = MTD_WRITEABLE;
283 mtd->writesize = ubi->min_io_size;
284 mtd->owner = THIS_MODULE;
285 mtd->size = vol->usable_leb_size * vol->reserved_pebs;
286 mtd->erasesize = vol->usable_leb_size;
287 mtd->read = gluebi_read;
288 mtd->write = gluebi_write;
289 mtd->erase = gluebi_erase;
290 mtd->get_device = gluebi_get_device;
291 mtd->put_device = gluebi_put_device;
292
293 if (add_mtd_device(mtd)) {
294 ubi_err("cannot not add MTD device\n");
295 kfree(mtd->name);
296 return -ENFILE;
297 }
298
299 dbg_msg("added mtd%d (\"%s\"), size %u, EB size %u",
300 mtd->index, mtd->name, mtd->size, mtd->erasesize);
301 return 0;
302}
303
304/**
305 * ubi_destroy_gluebi - close gluebi for an UBI volume.
306 * @vol: volume description object
307 *
308 * This function is called when an UBI volume is removed in order to remove
309 * corresponding fake MTD device. Returns zero in case of success and a
310 * negative error code in case of failure.
311 */
312int ubi_destroy_gluebi(struct ubi_volume *vol)
313{
314 int err;
315 struct mtd_info *mtd = &vol->gluebi_mtd;
316
317 dbg_msg("remove mtd%d", mtd->index);
318 err = del_mtd_device(mtd);
319 if (err)
320 return err;
321 kfree(mtd->name);
322 return 0;
323}
diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c
new file mode 100644
index 000000000000..438914d05151
--- /dev/null
+++ b/drivers/mtd/ubi/io.c
@@ -0,0 +1,1259 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 * Copyright (c) Nokia Corporation, 2006, 2007
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * Author: Artem Bityutskiy (Битюцкий Артём)
20 */
21
22/*
23 * UBI input/output unit.
24 *
25 * This unit provides a uniform way to work with all kinds of the underlying
26 * MTD devices. It also implements handy functions for reading and writing UBI
27 * headers.
28 *
29 * We are trying to have a paranoid mindset and not to trust what we read
30 * from the flash media in order to be more secure and robust. So this unit
31 * validates every single header it reads from the flash media.
32 *
33 * Some words about how the eraseblock headers are stored.
34 *
35 * The erase counter header is always stored at offset zero. By default, the
36 * VID header is stored after the EC header at the closest aligned offset
37 * (i.e. aligned to the minimum I/O unit size). Data starts next to the VID
38 * header at the closest aligned offset. But this default layout may be
39 * changed. For example, for different reasons (e.g., optimization) UBI may be
40 * asked to put the VID header at a further offset, and even at an unaligned
41 * offset. Of course, if the offset of the VID header is unaligned, UBI adds
42 * proper padding in front of it. Data offset may also be changed but it has to
43 * be aligned.
44 *
45 * About minimal I/O units. In general, UBI assumes a flash device model where
46 * there is only one minimal I/O unit size. E.g., in case of NOR flash it is 1,
47 * in case of NAND flash it is a NAND page, etc. This is reported by MTD in the
48 * @ubi->mtd->writesize field. But as an exception, UBI allows using another
49 * (smaller) minimal I/O unit size for EC and VID headers to make it possible
50 * to do different optimizations.
51 *
52 * This is extremely useful in case of NAND flashes which allow several
53 * write operations to one NAND page. In this case UBI can fit EC and VID
54 * headers in one NAND page. Thus, UBI may use "sub-page" size as the minimal
55 * I/O unit for the headers (the @ubi->hdrs_min_io_size field). But it still
56 * reports NAND page size (@ubi->min_io_size) as a minimal I/O unit for the UBI
57 * users.
58 *
59 * Example: some Samsung NANDs with 2KiB pages allow 4x 512-byte writes, so
60 * although the minimal I/O unit is 2K, UBI uses 512 bytes for EC and VID
61 * headers.
62 *
63 * Q: why not just treat the sub-page as the minimal I/O unit of this flash
64 * device, e.g., make @ubi->min_io_size = 512 in the example above?
65 *
66 * A: because when writing a sub-page, MTD still writes a full 2K page, but the
67 * bytes which are not relevant to the sub-page are 0xFF. So, basically, writing
68 * 4x512 sub-pages is 4 times slower than writing one 2KiB NAND page. Thus, we
69 * prefer to use sub-pages only for the EC and VID headers.
70 *
71 * As it was noted above, the VID header may start at a non-aligned offset.
72 * For example, in case of a 2KiB page NAND flash with a 512 bytes sub-page,
73 * the VID header may reside at offset 1984 which is the last 64 bytes of the
74 * last sub-page (EC header is always at offset zero). This causes some
75 * difficulties when reading and writing VID headers.
76 *
77 * Suppose we have a 64-byte buffer and we read a VID header into it. We change
78 * the data and want to write this VID header out. As we can only write in
79 * 512-byte chunks, we have to allocate one more buffer and copy our VID header
80 * to offset 448 of this buffer.
81 *
82 * The I/O unit does the following trick in order to avoid this extra copy.
83 * It always allocates a @ubi->vid_hdr_alsize bytes buffer for the VID header
84 * and returns a pointer to offset @ubi->vid_hdr_shift of this buffer. When the
85 * VID header is being written out, it shifts the VID header pointer back and
86 * writes the whole sub-page.
87 */
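
/*
 * Editorial sketch (not part of the driver): how the pointer-shifting trick
 * described above might look from a caller's point of view. The allocation
 * below mirrors what ubi_zalloc_vid_hdr() is assumed to do elsewhere in UBI;
 * only the @ubi fields mentioned above are relied upon:
 *
 *	void *p = kzalloc(ubi->vid_hdr_alsize, GFP_KERNEL);
 *	struct ubi_vid_hdr *vid_hdr;
 *
 *	if (!p)
 *		return -ENOMEM;
 *	vid_hdr = p + ubi->vid_hdr_shift;
 *	// ... fill the VID header fields ...
 *	// Shift back and write the whole sub-page - no extra copy needed
 *	err = ubi_io_write(ubi, p, pnum, ubi->vid_hdr_aloffset,
 *			   ubi->vid_hdr_alsize);
 */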
88
89#include <linux/crc32.h>
90#include <linux/err.h>
91#include "ubi.h"
92
93#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
94static int paranoid_check_not_bad(const struct ubi_device *ubi, int pnum);
95static int paranoid_check_peb_ec_hdr(const struct ubi_device *ubi, int pnum);
96static int paranoid_check_ec_hdr(const struct ubi_device *ubi, int pnum,
97 const struct ubi_ec_hdr *ec_hdr);
98static int paranoid_check_peb_vid_hdr(const struct ubi_device *ubi, int pnum);
99static int paranoid_check_vid_hdr(const struct ubi_device *ubi, int pnum,
100 const struct ubi_vid_hdr *vid_hdr);
101static int paranoid_check_all_ff(const struct ubi_device *ubi, int pnum,
102 int offset, int len);
103#else
104#define paranoid_check_not_bad(ubi, pnum) 0
105#define paranoid_check_peb_ec_hdr(ubi, pnum) 0
106#define paranoid_check_ec_hdr(ubi, pnum, ec_hdr) 0
107#define paranoid_check_peb_vid_hdr(ubi, pnum) 0
108#define paranoid_check_vid_hdr(ubi, pnum, vid_hdr) 0
109#define paranoid_check_all_ff(ubi, pnum, offset, len) 0
110#endif
111
112/**
113 * ubi_io_read - read data from a physical eraseblock.
114 * @ubi: UBI device description object
115 * @buf: buffer where to store the read data
116 * @pnum: physical eraseblock number to read from
117 * @offset: offset within the physical eraseblock from where to read
118 * @len: how many bytes to read
119 *
120 * This function reads data from offset @offset of physical eraseblock @pnum
121 * and stores the read data in the @buf buffer. The following return codes are
122 * possible:
123 *
124 * o %0 if all the requested data were successfully read;
125 * o %UBI_IO_BITFLIPS if all the requested data were successfully read, but
126 * correctable bit-flips were detected; this is harmless but may indicate
127 *   that this eraseblock may become bad soon (but it does not have to);
128 * o %-EBADMSG if the MTD subsystem reported a data integrity problem, for
129 *   example an ECC error in case of NAND; this most
130 * probably means that the data is corrupted;
131 * o %-EIO if some I/O error occurred;
132 * o other negative error codes in case of other errors.
133 */
134int ubi_io_read(const struct ubi_device *ubi, void *buf, int pnum, int offset,
135 int len)
136{
137 int err, retries = 0;
138 size_t read;
139 loff_t addr;
140
141 dbg_io("read %d bytes from PEB %d:%d", len, pnum, offset);
142
143 ubi_assert(pnum >= 0 && pnum < ubi->peb_count);
144 ubi_assert(offset >= 0 && offset + len <= ubi->peb_size);
145 ubi_assert(len > 0);
146
147 err = paranoid_check_not_bad(ubi, pnum);
148 if (err)
149 return err > 0 ? -EINVAL : err;
150
151 addr = (loff_t)pnum * ubi->peb_size + offset;
152retry:
153 err = ubi->mtd->read(ubi->mtd, addr, len, &read, buf);
154 if (err) {
155 if (err == -EUCLEAN) {
156 /*
157 * -EUCLEAN is reported if there was a bit-flip which
158 * was corrected, so this is harmless.
159 */
160 ubi_msg("fixable bit-flip detected at PEB %d", pnum);
161 ubi_assert(len == read);
162 return UBI_IO_BITFLIPS;
163 }
164
165 if (read != len && retries++ < UBI_IO_RETRIES) {
166 dbg_io("error %d while reading %d bytes from PEB %d:%d, "
167 "read only %zd bytes, retry",
168 err, len, pnum, offset, read);
169 yield();
170 goto retry;
171 }
172
173 ubi_err("error %d while reading %d bytes from PEB %d:%d, "
174 "read %zd bytes", err, len, pnum, offset, read);
175 ubi_dbg_dump_stack();
176 } else {
177 ubi_assert(len == read);
178
179 if (ubi_dbg_is_bitflip()) {
180 dbg_msg("bit-flip (emulated)");
181 err = UBI_IO_BITFLIPS;
182 }
183 }
184
185 return err;
186}
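
/*
 * Editorial sketch of the intended caller pattern for the return codes
 * above: %UBI_IO_BITFLIPS is a success code, so a caller would typically
 * use the data but remember that the PEB should be scrubbed (the scrubbing
 * itself lives in the wear-levelling unit and is only assumed here):
 *
 *	int scrub = 0;
 *
 *	err = ubi_io_read(ubi, buf, pnum, offset, len);
 *	if (err == UBI_IO_BITFLIPS) {
 *		scrub = 1;
 *		err = 0;
 *	}
 *	if (err)
 *		return err;
 */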
187
188/**
189 * ubi_io_write - write data to a physical eraseblock.
190 * @ubi: UBI device description object
191 * @buf: buffer with the data to write
192 * @pnum: physical eraseblock number to write to
193 * @offset: offset within the physical eraseblock where to write
194 * @len: how many bytes to write
195 *
196 * This function writes @len bytes of data from buffer @buf to offset @offset
197 * of physical eraseblock @pnum. If all the data were successfully written,
198 * zero is returned. If an error occurred, this function returns a negative
199 * error code. If %-EIO is returned, the physical eraseblock most probably went
200 * bad.
201 *
202 * Note, in case of an error, it is possible that something was still written
203 * to the flash media, but it may be some garbage.
204 */
205int ubi_io_write(const struct ubi_device *ubi, const void *buf, int pnum,
206 int offset, int len)
207{
208 int err;
209 size_t written;
210 loff_t addr;
211
212 dbg_io("write %d bytes to PEB %d:%d", len, pnum, offset);
213
214 ubi_assert(pnum >= 0 && pnum < ubi->peb_count);
215 ubi_assert(offset >= 0 && offset + len <= ubi->peb_size);
216 ubi_assert(offset % ubi->hdrs_min_io_size == 0);
217 ubi_assert(len > 0 && len % ubi->hdrs_min_io_size == 0);
218
219 if (ubi->ro_mode) {
220 ubi_err("read-only mode");
221 return -EROFS;
222 }
223
224 /* The below has to be compiled out if paranoid checks are disabled */
225
226 err = paranoid_check_not_bad(ubi, pnum);
227 if (err)
228 return err > 0 ? -EINVAL : err;
229
230 /* The area we are writing to has to contain all 0xFF bytes */
231 err = paranoid_check_all_ff(ubi, pnum, offset, len);
232 if (err)
233 return err > 0 ? -EINVAL : err;
234
235 if (offset >= ubi->leb_start) {
236 /*
237 * We write to the data area of the physical eraseblock. Make
238 * sure it has valid EC and VID headers.
239 */
240 err = paranoid_check_peb_ec_hdr(ubi, pnum);
241 if (err)
242 return err > 0 ? -EINVAL : err;
243 err = paranoid_check_peb_vid_hdr(ubi, pnum);
244 if (err)
245 return err > 0 ? -EINVAL : err;
246 }
247
248 if (ubi_dbg_is_write_failure()) {
249 dbg_err("cannot write %d bytes to PEB %d:%d "
250 "(emulated)", len, pnum, offset);
251 ubi_dbg_dump_stack();
252 return -EIO;
253 }
254
255 addr = (loff_t)pnum * ubi->peb_size + offset;
256 err = ubi->mtd->write(ubi->mtd, addr, len, &written, buf);
257 if (err) {
258 ubi_err("error %d while writing %d bytes to PEB %d:%d, written"
259 " %zd bytes", err, len, pnum, offset, written);
260 ubi_dbg_dump_stack();
261 } else
262 ubi_assert(written == len);
263
264 return err;
265}
266
267/**
268 * erase_callback - MTD erasure call-back.
269 * @ei: MTD erase information object.
270 *
271 * Note, even though the MTD erase interface is asynchronous, all the current
272 * implementations are synchronous anyway.
273 */
274static void erase_callback(struct erase_info *ei)
275{
276 wake_up_interruptible((wait_queue_head_t *)ei->priv);
277}
278
279/**
280 * do_sync_erase - synchronously erase a physical eraseblock.
281 * @ubi: UBI device description object
282 * @pnum: the physical eraseblock number to erase
283 *
284 * This function synchronously erases physical eraseblock @pnum and returns
285 * zero in case of success and a negative error code in case of failure. If
286 * %-EIO is returned, the physical eraseblock most probably went bad.
287 */
288static int do_sync_erase(const struct ubi_device *ubi, int pnum)
289{
290 int err, retries = 0;
291 struct erase_info ei;
292 wait_queue_head_t wq;
293
294 dbg_io("erase PEB %d", pnum);
295
296retry:
297 init_waitqueue_head(&wq);
298 memset(&ei, 0, sizeof(struct erase_info));
299
300 ei.mtd = ubi->mtd;
301	ei.addr = (loff_t)pnum * ubi->peb_size;
302 ei.len = ubi->peb_size;
303 ei.callback = erase_callback;
304 ei.priv = (unsigned long)&wq;
305
306 err = ubi->mtd->erase(ubi->mtd, &ei);
307 if (err) {
308 if (retries++ < UBI_IO_RETRIES) {
309 dbg_io("error %d while erasing PEB %d, retry",
310 err, pnum);
311 yield();
312 goto retry;
313 }
314 ubi_err("cannot erase PEB %d, error %d", pnum, err);
315 ubi_dbg_dump_stack();
316 return err;
317 }
318
319 err = wait_event_interruptible(wq, ei.state == MTD_ERASE_DONE ||
320 ei.state == MTD_ERASE_FAILED);
321 if (err) {
322 ubi_err("interrupted PEB %d erasure", pnum);
323 return -EINTR;
324 }
325
326 if (ei.state == MTD_ERASE_FAILED) {
327 if (retries++ < UBI_IO_RETRIES) {
328 dbg_io("error while erasing PEB %d, retry", pnum);
329 yield();
330 goto retry;
331 }
332 ubi_err("cannot erase PEB %d", pnum);
333 ubi_dbg_dump_stack();
334 return -EIO;
335 }
336
337 err = paranoid_check_all_ff(ubi, pnum, 0, ubi->peb_size);
338 if (err)
339 return err > 0 ? -EINVAL : err;
340
341 if (ubi_dbg_is_erase_failure() && !err) {
342 dbg_err("cannot erase PEB %d (emulated)", pnum);
343 return -EIO;
344 }
345
346 return 0;
347}
348
349/**
350 * check_pattern - check if buffer contains only a certain byte pattern.
351 * @buf: buffer to check
352 * @patt: the pattern to check
353 * @size: buffer size in bytes
354 *
355 * This function returns %1 if there are only @patt bytes in @buf, and %0 if
356 * something else was also found.
357 */
358static int check_pattern(const void *buf, uint8_t patt, int size)
359{
360 int i;
361
362 for (i = 0; i < size; i++)
363 if (((const uint8_t *)buf)[i] != patt)
364 return 0;
365 return 1;
366}
367
368/* Patterns to write to a physical eraseblock when torturing it */
369static uint8_t patterns[] = {0xa5, 0x5a, 0x0};
370
371/**
372 * torture_peb - test a supposedly bad physical eraseblock.
373 * @ubi: UBI device description object
374 * @pnum: the physical eraseblock number to test
375 *
376 * This function returns %-EIO if the physical eraseblock did not pass the
377 * test, a positive number of erase operations done if the test was
378 * successfully passed, and other negative error codes in case of other errors.
379 */
380static int torture_peb(const struct ubi_device *ubi, int pnum)
381{
382 void *buf;
383 int err, i, patt_count;
384
385 buf = kmalloc(ubi->peb_size, GFP_KERNEL);
386 if (!buf)
387 return -ENOMEM;
388
389 patt_count = ARRAY_SIZE(patterns);
390 ubi_assert(patt_count > 0);
391
392 for (i = 0; i < patt_count; i++) {
393 err = do_sync_erase(ubi, pnum);
394 if (err)
395 goto out;
396
397 /* Make sure the PEB contains only 0xFF bytes */
398 err = ubi_io_read(ubi, buf, pnum, 0, ubi->peb_size);
399 if (err)
400 goto out;
401
402 err = check_pattern(buf, 0xFF, ubi->peb_size);
403 if (err == 0) {
404 ubi_err("erased PEB %d, but a non-0xFF byte found",
405 pnum);
406 err = -EIO;
407 goto out;
408 }
409
410 /* Write a pattern and check it */
411 memset(buf, patterns[i], ubi->peb_size);
412 err = ubi_io_write(ubi, buf, pnum, 0, ubi->peb_size);
413 if (err)
414 goto out;
415
416 memset(buf, ~patterns[i], ubi->peb_size);
417 err = ubi_io_read(ubi, buf, pnum, 0, ubi->peb_size);
418 if (err)
419 goto out;
420
421 err = check_pattern(buf, patterns[i], ubi->peb_size);
422 if (err == 0) {
423 ubi_err("pattern %x checking failed for PEB %d",
424 patterns[i], pnum);
425 err = -EIO;
426 goto out;
427 }
428 }
429
430 err = patt_count;
431
432out:
433 if (err == UBI_IO_BITFLIPS || err == -EBADMSG)
434 /*
435 * If a bit-flip or data integrity error was detected, the test
436 * has not passed because it happened on a freshly erased
437 * physical eraseblock which means something is wrong with it.
438 */
439 err = -EIO;
440 kfree(buf);
441 return err;
442}
443
444/**
445 * ubi_io_sync_erase - synchronously erase a physical eraseblock.
446 * @ubi: UBI device description object
447 * @pnum: physical eraseblock number to erase
448 * @torture: if this physical eraseblock has to be tortured
449 *
450 * This function synchronously erases physical eraseblock @pnum. If @torture
451 * flag is not zero, the physical eraseblock is checked by means of writing
452 * different patterns to it and reading them back. If the torturing is enabled,
453 * the physical eraseblock is erased more than once.
454 *
455 * This function returns the number of erasures made in case of success, %-EIO
456 * if the erasure failed or the torturing test failed, and other negative error
457 * codes in case of other errors. Note, %-EIO means that the physical
458 * eraseblock is bad.
459 */
460int ubi_io_sync_erase(const struct ubi_device *ubi, int pnum, int torture)
461{
462 int err, ret = 0;
463
464 ubi_assert(pnum >= 0 && pnum < ubi->peb_count);
465
466 err = paranoid_check_not_bad(ubi, pnum);
467 if (err != 0)
468 return err > 0 ? -EINVAL : err;
469
470 if (ubi->ro_mode) {
471 ubi_err("read-only mode");
472 return -EROFS;
473 }
474
475 if (torture) {
476 ret = torture_peb(ubi, pnum);
477 if (ret < 0)
478 return ret;
479 }
480
481 err = do_sync_erase(ubi, pnum);
482 if (err)
483 return err;
484
485 return ret + 1;
486}
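
/*
 * Editorial sketch: because the return value of ubi_io_sync_erase() is the
 * number of erasures rather than plain zero, a caller which maintains erase
 * counters would add it to the current counter instead of incrementing by
 * one (@ec is hypothetical here):
 *
 *	err = ubi_io_sync_erase(ubi, pnum, torture);
 *	if (err < 0)
 *		return err;
 *	ec += err;	// torturing may have erased the PEB several times
 */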
487
488/**
489 * ubi_io_is_bad - check if a physical eraseblock is bad.
490 * @ubi: UBI device description object
491 * @pnum: the physical eraseblock number to check
492 *
493 * This function returns a positive number if the physical eraseblock is bad,
494 * zero if not, and a negative error code if an error occurred.
495 */
496int ubi_io_is_bad(const struct ubi_device *ubi, int pnum)
497{
498 struct mtd_info *mtd = ubi->mtd;
499
500 ubi_assert(pnum >= 0 && pnum < ubi->peb_count);
501
502 if (ubi->bad_allowed) {
503 int ret;
504
505 ret = mtd->block_isbad(mtd, (loff_t)pnum * ubi->peb_size);
506 if (ret < 0)
507 ubi_err("error %d while checking if PEB %d is bad",
508 ret, pnum);
509 else if (ret)
510 dbg_io("PEB %d is bad", pnum);
511 return ret;
512 }
513
514 return 0;
515}
516
517/**
518 * ubi_io_mark_bad - mark a physical eraseblock as bad.
519 * @ubi: UBI device description object
520 * @pnum: the physical eraseblock number to mark
521 *
522 * This function returns zero in case of success and a negative error code in
523 * case of failure.
524 */
525int ubi_io_mark_bad(const struct ubi_device *ubi, int pnum)
526{
527 int err;
528 struct mtd_info *mtd = ubi->mtd;
529
530 ubi_assert(pnum >= 0 && pnum < ubi->peb_count);
531
532 if (ubi->ro_mode) {
533 ubi_err("read-only mode");
534 return -EROFS;
535 }
536
537 if (!ubi->bad_allowed)
538 return 0;
539
540 err = mtd->block_markbad(mtd, (loff_t)pnum * ubi->peb_size);
541 if (err)
542 ubi_err("cannot mark PEB %d bad, error %d", pnum, err);
543 return err;
544}
545
546/**
547 * validate_ec_hdr - validate an erase counter header.
548 * @ubi: UBI device description object
549 * @ec_hdr: the erase counter header to check
550 *
551 * This function returns zero if the erase counter header is OK, and %1 if
552 * not.
553 */
554static int validate_ec_hdr(const struct ubi_device *ubi,
555 const struct ubi_ec_hdr *ec_hdr)
556{
557 long long ec;
558 int vid_hdr_offset, leb_start;
559
560 ec = ubi64_to_cpu(ec_hdr->ec);
561 vid_hdr_offset = ubi32_to_cpu(ec_hdr->vid_hdr_offset);
562 leb_start = ubi32_to_cpu(ec_hdr->data_offset);
563
564 if (ec_hdr->version != UBI_VERSION) {
565 ubi_err("node with incompatible UBI version found: "
566 "this UBI version is %d, image version is %d",
567 UBI_VERSION, (int)ec_hdr->version);
568 goto bad;
569 }
570
571 if (vid_hdr_offset != ubi->vid_hdr_offset) {
572 ubi_err("bad VID header offset %d, expected %d",
573 vid_hdr_offset, ubi->vid_hdr_offset);
574 goto bad;
575 }
576
577 if (leb_start != ubi->leb_start) {
578 ubi_err("bad data offset %d, expected %d",
579 leb_start, ubi->leb_start);
580 goto bad;
581 }
582
583 if (ec < 0 || ec > UBI_MAX_ERASECOUNTER) {
584 ubi_err("bad erase counter %lld", ec);
585 goto bad;
586 }
587
588 return 0;
589
590bad:
591 ubi_err("bad EC header");
592 ubi_dbg_dump_ec_hdr(ec_hdr);
593 ubi_dbg_dump_stack();
594 return 1;
595}
596
597/**
598 * ubi_io_read_ec_hdr - read and check an erase counter header.
599 * @ubi: UBI device description object
600 * @pnum: physical eraseblock to read from
601 * @ec_hdr: a &struct ubi_ec_hdr object where to store the read erase counter
602 * header
603 * @verbose: be verbose if the header is corrupted or was not found
604 *
605 * This function reads erase counter header from physical eraseblock @pnum and
606 * stores it in @ec_hdr. This function also checks CRC checksum of the read
607 * erase counter header. The following codes may be returned:
608 *
609 * o %0 if the CRC checksum is correct and the header was successfully read;
610 * o %UBI_IO_BITFLIPS if the CRC is correct, but bit-flips were detected
611 * and corrected by the flash driver; this is harmless but may indicate that
612 * this eraseblock may become bad soon (but it does not have to);
613 * o %UBI_IO_BAD_EC_HDR if the erase counter header is corrupted (a CRC error);
614 * o %UBI_IO_PEB_EMPTY if the physical eraseblock is empty;
615 * o a negative error code in case of failure.
616 */
617int ubi_io_read_ec_hdr(const struct ubi_device *ubi, int pnum,
618 struct ubi_ec_hdr *ec_hdr, int verbose)
619{
620 int err, read_err = 0;
621 uint32_t crc, magic, hdr_crc;
622
623 dbg_io("read EC header from PEB %d", pnum);
624 ubi_assert(pnum >= 0 && pnum < ubi->peb_count);
625
626 err = ubi_io_read(ubi, ec_hdr, pnum, 0, UBI_EC_HDR_SIZE);
627 if (err) {
628 if (err != UBI_IO_BITFLIPS && err != -EBADMSG)
629 return err;
630
631 /*
632 * We read all the data, but either a correctable bit-flip
633		 * occurred, or MTD reported some data integrity error,
634		 * like an ECC error in case of NAND. The former is harmless,
635		 * the latter may mean that the read data is corrupted. But we
636		 * have a CRC check-sum and we will detect this. If the EC
637		 * header is still OK, we just report this as if there was a
638 * bit-flip.
639 */
640 read_err = err;
641 }
642
643 magic = ubi32_to_cpu(ec_hdr->magic);
644 if (magic != UBI_EC_HDR_MAGIC) {
645 /*
646 * The magic field is wrong. Let's check if we have read all
647 * 0xFF. If yes, this physical eraseblock is assumed to be
648 * empty.
649 *
650 * But if there was a read error, we do not test it for all
651 * 0xFFs. Even if it does contain all 0xFFs, this error
652 * indicates that something is still wrong with this physical
653 * eraseblock and we anyway cannot treat it as empty.
654 */
655 if (read_err != -EBADMSG &&
656 check_pattern(ec_hdr, 0xFF, UBI_EC_HDR_SIZE)) {
657 /* The physical eraseblock is supposedly empty */
658
659 /*
660 * The below is just a paranoid check, it has to be
661 * compiled out if paranoid checks are disabled.
662 */
663 err = paranoid_check_all_ff(ubi, pnum, 0,
664 ubi->peb_size);
665 if (err)
666 return err > 0 ? UBI_IO_BAD_EC_HDR : err;
667
668 if (verbose)
669 ubi_warn("no EC header found at PEB %d, "
670 "only 0xFF bytes", pnum);
671 return UBI_IO_PEB_EMPTY;
672 }
673
674 /*
675 * This is not a valid erase counter header, and these are not
676 * 0xFF bytes. Report that the header is corrupted.
677 */
678 if (verbose) {
679 ubi_warn("bad magic number at PEB %d: %08x instead of "
680 "%08x", pnum, magic, UBI_EC_HDR_MAGIC);
681 ubi_dbg_dump_ec_hdr(ec_hdr);
682 }
683 return UBI_IO_BAD_EC_HDR;
684 }
685
686 crc = crc32(UBI_CRC32_INIT, ec_hdr, UBI_EC_HDR_SIZE_CRC);
687 hdr_crc = ubi32_to_cpu(ec_hdr->hdr_crc);
688
689 if (hdr_crc != crc) {
690 if (verbose) {
691 ubi_warn("bad EC header CRC at PEB %d, calculated %#08x,"
692 " read %#08x", pnum, crc, hdr_crc);
693 ubi_dbg_dump_ec_hdr(ec_hdr);
694 }
695 return UBI_IO_BAD_EC_HDR;
696 }
697
698 /* And of course validate what has just been read from the media */
699 err = validate_ec_hdr(ubi, ec_hdr);
700 if (err) {
701 ubi_err("validation failed for PEB %d", pnum);
702 return -EINVAL;
703 }
704
705 return read_err ? UBI_IO_BITFLIPS : 0;
706}
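
/*
 * Editorial sketch of how the return codes above are meant to be consumed,
 * e.g. by a scanning loop (shortened; the branch bodies are placeholders):
 *
 *	err = ubi_io_read_ec_hdr(ubi, pnum, ec_hdr, 0);
 *	if (err < 0)
 *		return err;
 *	else if (err == UBI_IO_PEB_EMPTY)
 *		;	// never used - treat the PEB as free
 *	else if (err == UBI_IO_BAD_EC_HDR)
 *		;	// corrupted header - schedule erasure (or torture)
 *	else
 *		ec = ubi64_to_cpu(ec_hdr->ec);	// %0 or %UBI_IO_BITFLIPS
 */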
707
708/**
709 * ubi_io_write_ec_hdr - write an erase counter header.
710 * @ubi: UBI device description object
711 * @pnum: physical eraseblock to write to
712 * @ec_hdr: the erase counter header to write
713 *
714 * This function writes erase counter header described by @ec_hdr to physical
715 * eraseblock @pnum. It also fills most fields of @ec_hdr before writing, so
716 * the caller does not have to fill them. Callers must only fill the @ec_hdr->ec
717 * field.
718 *
719 * This function returns zero in case of success and a negative error code in
720 * case of failure. If %-EIO is returned, the physical eraseblock most probably
721 * went bad.
722 */
723int ubi_io_write_ec_hdr(const struct ubi_device *ubi, int pnum,
724 struct ubi_ec_hdr *ec_hdr)
725{
726 int err;
727 uint32_t crc;
728
729 dbg_io("write EC header to PEB %d", pnum);
730 ubi_assert(pnum >= 0 && pnum < ubi->peb_count);
731
732 ec_hdr->magic = cpu_to_ubi32(UBI_EC_HDR_MAGIC);
733 ec_hdr->version = UBI_VERSION;
734 ec_hdr->vid_hdr_offset = cpu_to_ubi32(ubi->vid_hdr_offset);
735 ec_hdr->data_offset = cpu_to_ubi32(ubi->leb_start);
736 crc = crc32(UBI_CRC32_INIT, ec_hdr, UBI_EC_HDR_SIZE_CRC);
737 ec_hdr->hdr_crc = cpu_to_ubi32(crc);
738
739 err = paranoid_check_ec_hdr(ubi, pnum, ec_hdr);
740 if (err)
741 return -EINVAL;
742
743 err = ubi_io_write(ubi, ec_hdr, pnum, 0, ubi->ec_hdr_alsize);
744 return err;
745}
746
747/**
748 * validate_vid_hdr - validate a volume identifier header.
749 * @ubi: UBI device description object
750 * @vid_hdr: the volume identifier header to check
751 *
752 * This function checks the data stored in the volume identifier header
753 * @vid_hdr. Returns zero if the VID header is OK and %1 if not.
754 */
755static int validate_vid_hdr(const struct ubi_device *ubi,
756 const struct ubi_vid_hdr *vid_hdr)
757{
758 int vol_type = vid_hdr->vol_type;
759 int copy_flag = vid_hdr->copy_flag;
760 int vol_id = ubi32_to_cpu(vid_hdr->vol_id);
761 int lnum = ubi32_to_cpu(vid_hdr->lnum);
762 int compat = vid_hdr->compat;
763 int data_size = ubi32_to_cpu(vid_hdr->data_size);
764 int used_ebs = ubi32_to_cpu(vid_hdr->used_ebs);
765 int data_pad = ubi32_to_cpu(vid_hdr->data_pad);
766 int data_crc = ubi32_to_cpu(vid_hdr->data_crc);
767 int usable_leb_size = ubi->leb_size - data_pad;
768
769 if (copy_flag != 0 && copy_flag != 1) {
770 dbg_err("bad copy_flag");
771 goto bad;
772 }
773
774 if (vol_id < 0 || lnum < 0 || data_size < 0 || used_ebs < 0 ||
775 data_pad < 0) {
776 dbg_err("negative values");
777 goto bad;
778 }
779
780 if (vol_id >= UBI_MAX_VOLUMES && vol_id < UBI_INTERNAL_VOL_START) {
781 dbg_err("bad vol_id");
782 goto bad;
783 }
784
785 if (vol_id < UBI_INTERNAL_VOL_START && compat != 0) {
786 dbg_err("bad compat");
787 goto bad;
788 }
789
790 if (vol_id >= UBI_INTERNAL_VOL_START && compat != UBI_COMPAT_DELETE &&
791 compat != UBI_COMPAT_RO && compat != UBI_COMPAT_PRESERVE &&
792 compat != UBI_COMPAT_REJECT) {
793 dbg_err("bad compat");
794 goto bad;
795 }
796
797 if (vol_type != UBI_VID_DYNAMIC && vol_type != UBI_VID_STATIC) {
798 dbg_err("bad vol_type");
799 goto bad;
800 }
801
802 if (data_pad >= ubi->leb_size / 2) {
803 dbg_err("bad data_pad");
804 goto bad;
805 }
806
807 if (vol_type == UBI_VID_STATIC) {
808 /*
809		 * Although from a high-level point of view static volumes may
810		 * contain zero bytes of data, no VID header can contain
811		 * zero in these fields, because empty volumes do not have
812 * mapped logical eraseblocks.
813 */
814 if (used_ebs == 0) {
815 dbg_err("zero used_ebs");
816 goto bad;
817 }
818 if (data_size == 0) {
819 dbg_err("zero data_size");
820 goto bad;
821 }
822 if (lnum < used_ebs - 1) {
823 if (data_size != usable_leb_size) {
824 dbg_err("bad data_size");
825 goto bad;
826 }
827 } else if (lnum == used_ebs - 1) {
828 if (data_size == 0) {
829 dbg_err("bad data_size at last LEB");
830 goto bad;
831 }
832 } else {
833 dbg_err("too high lnum");
834 goto bad;
835 }
836 } else {
837 if (copy_flag == 0) {
838 if (data_crc != 0) {
839 dbg_err("non-zero data CRC");
840 goto bad;
841 }
842 if (data_size != 0) {
843 dbg_err("non-zero data_size");
844 goto bad;
845 }
846 } else {
847 if (data_size == 0) {
848 dbg_err("zero data_size of copy");
849 goto bad;
850 }
851 }
852 if (used_ebs != 0) {
853 dbg_err("bad used_ebs");
854 goto bad;
855 }
856 }
857
858 return 0;
859
860bad:
861 ubi_err("bad VID header");
862 ubi_dbg_dump_vid_hdr(vid_hdr);
863 ubi_dbg_dump_stack();
864 return 1;
865}
866
867/**
868 * ubi_io_read_vid_hdr - read and check a volume identifier header.
869 * @ubi: UBI device description object
870 * @pnum: physical eraseblock number to read from
871 * @vid_hdr: &struct ubi_vid_hdr object where to store the read volume
872 * identifier header
873 * @verbose: be verbose if the header is corrupted or wasn't found
874 *
875 * This function reads the volume identifier header from physical eraseblock
876 * @pnum and stores it in @vid_hdr. It also checks CRC checksum of the read
877 * volume identifier header. The following codes may be returned:
878 *
879 * o %0 if the CRC checksum is correct and the header was successfully read;
880 * o %UBI_IO_BITFLIPS if the CRC is correct, but bit-flips were detected
881 * and corrected by the flash driver; this is harmless but may indicate that
882 * this eraseblock may become bad soon;
883 * o %UBI_IO_BAD_VID_HDR if the volume identifier header is corrupted (a CRC
884 * error detected);
885 * o %UBI_IO_PEB_FREE if the physical eraseblock is free (i.e., there is no VID
886 * header there);
887 * o a negative error code in case of failure.
888 */
889int ubi_io_read_vid_hdr(const struct ubi_device *ubi, int pnum,
890 struct ubi_vid_hdr *vid_hdr, int verbose)
891{
892 int err, read_err = 0;
893 uint32_t crc, magic, hdr_crc;
894 void *p;
895
896 dbg_io("read VID header from PEB %d", pnum);
897 ubi_assert(pnum >= 0 && pnum < ubi->peb_count);
898
899 p = (char *)vid_hdr - ubi->vid_hdr_shift;
900 err = ubi_io_read(ubi, p, pnum, ubi->vid_hdr_aloffset,
901 ubi->vid_hdr_alsize);
902 if (err) {
903 if (err != UBI_IO_BITFLIPS && err != -EBADMSG)
904 return err;
905
906 /*
907 * We read all the data, but either a correctable bit-flip
908		 * occurred, or MTD reported some data integrity error,
909		 * like an ECC error in case of NAND. The former is harmless,
910		 * the latter may mean the read data is corrupted. But we have a
911		 * CRC check-sum and we will identify this. If the VID header is
912		 * still OK, we just report this as if there was a bit-flip.
913 */
914 read_err = err;
915 }
916
917 magic = ubi32_to_cpu(vid_hdr->magic);
918 if (magic != UBI_VID_HDR_MAGIC) {
919 /*
920 * If we have read all 0xFF bytes, the VID header probably does
921 * not exist and the physical eraseblock is assumed to be free.
922 *
923 * But if there was a read error, we do not test the data for
924 * 0xFFs. Even if it does contain all 0xFFs, this error
925 * indicates that something is still wrong with this physical
926 * eraseblock and it cannot be regarded as free.
927 */
928 if (read_err != -EBADMSG &&
929 check_pattern(vid_hdr, 0xFF, UBI_VID_HDR_SIZE)) {
930 /* The physical eraseblock is supposedly free */
931
932 /*
933 * The below is just a paranoid check, it has to be
934 * compiled out if paranoid checks are disabled.
935 */
936 err = paranoid_check_all_ff(ubi, pnum, ubi->leb_start,
937 ubi->leb_size);
938 if (err)
939 return err > 0 ? UBI_IO_BAD_VID_HDR : err;
940
941 if (verbose)
942 ubi_warn("no VID header found at PEB %d, "
943 "only 0xFF bytes", pnum);
944 return UBI_IO_PEB_FREE;
945 }
946
947 /*
948 * This is not a valid VID header, and these are not 0xFF
949 * bytes. Report that the header is corrupted.
950 */
951 if (verbose) {
952 ubi_warn("bad magic number at PEB %d: %08x instead of "
953 "%08x", pnum, magic, UBI_VID_HDR_MAGIC);
954 ubi_dbg_dump_vid_hdr(vid_hdr);
955 }
956 return UBI_IO_BAD_VID_HDR;
957 }
958
959 crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_VID_HDR_SIZE_CRC);
960 hdr_crc = ubi32_to_cpu(vid_hdr->hdr_crc);
961
962 if (hdr_crc != crc) {
963 if (verbose) {
964 ubi_warn("bad CRC at PEB %d, calculated %#08x, "
965 "read %#08x", pnum, crc, hdr_crc);
966 ubi_dbg_dump_vid_hdr(vid_hdr);
967 }
968 return UBI_IO_BAD_VID_HDR;
969 }
970
971 /* Validate the VID header that we have just read */
972 err = validate_vid_hdr(ubi, vid_hdr);
973 if (err) {
974 ubi_err("validation failed for PEB %d", pnum);
975 return -EINVAL;
976 }
977
978 return read_err ? UBI_IO_BITFLIPS : 0;
979}
980
981/**
982 * ubi_io_write_vid_hdr - write a volume identifier header.
983 * @ubi: UBI device description object
984 * @pnum: the physical eraseblock number to write to
985 * @vid_hdr: the volume identifier header to write
986 *
987 * This function writes the volume identifier header described by @vid_hdr to
988 * physical eraseblock @pnum. This function automatically fills the
989 * @vid_hdr->magic and the @vid_hdr->version fields, as well as calculates
990 * header CRC checksum and stores it at vid_hdr->hdr_crc.
991 *
992 * This function returns zero in case of success and a negative error code in
993 * case of failure. If %-EIO is returned, the physical eraseblock probably went
994 * bad.
995 */
996int ubi_io_write_vid_hdr(const struct ubi_device *ubi, int pnum,
997 struct ubi_vid_hdr *vid_hdr)
998{
999 int err;
1000 uint32_t crc;
1001 void *p;
1002
1003 dbg_io("write VID header to PEB %d", pnum);
1004 ubi_assert(pnum >= 0 && pnum < ubi->peb_count);
1005
1006 err = paranoid_check_peb_ec_hdr(ubi, pnum);
1007 if (err)
1008		return err > 0 ? -EINVAL : err;
1009
1010 vid_hdr->magic = cpu_to_ubi32(UBI_VID_HDR_MAGIC);
1011 vid_hdr->version = UBI_VERSION;
1012 crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_VID_HDR_SIZE_CRC);
1013 vid_hdr->hdr_crc = cpu_to_ubi32(crc);
1014
1015 err = paranoid_check_vid_hdr(ubi, pnum, vid_hdr);
1016 if (err)
1017 return -EINVAL;
1018
1019 p = (char *)vid_hdr - ubi->vid_hdr_shift;
1020 err = ubi_io_write(ubi, p, pnum, ubi->vid_hdr_aloffset,
1021 ubi->vid_hdr_alsize);
1022 return err;
1023}
1024
1025#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
1026
1027/**
1028 * paranoid_check_not_bad - ensure that a physical eraseblock is not bad.
1029 * @ubi: UBI device description object
1030 * @pnum: physical eraseblock number to check
1031 *
1032 * This function returns zero if the physical eraseblock is good, a positive
1033 * number if it is bad and a negative error code if an error occurred.
1034 */
1035static int paranoid_check_not_bad(const struct ubi_device *ubi, int pnum)
1036{
1037 int err;
1038
1039 err = ubi_io_is_bad(ubi, pnum);
1040 if (!err)
1041 return err;
1042
1043 ubi_err("paranoid check failed for PEB %d", pnum);
1044 ubi_dbg_dump_stack();
1045 return err;
1046}
1047
1048/**
1049 * paranoid_check_ec_hdr - check if an erase counter header is all right.
1050 * @ubi: UBI device description object
1051 * @pnum: physical eraseblock number the erase counter header belongs to
1052 * @ec_hdr: the erase counter header to check
1053 *
1054 * This function returns zero if the erase counter header contains valid
1055 * values, and %1 if not.
1056 */
1057static int paranoid_check_ec_hdr(const struct ubi_device *ubi, int pnum,
1058 const struct ubi_ec_hdr *ec_hdr)
1059{
1060 int err;
1061 uint32_t magic;
1062
1063 magic = ubi32_to_cpu(ec_hdr->magic);
1064 if (magic != UBI_EC_HDR_MAGIC) {
1065 ubi_err("bad magic %#08x, must be %#08x",
1066 magic, UBI_EC_HDR_MAGIC);
1067 goto fail;
1068 }
1069
1070 err = validate_ec_hdr(ubi, ec_hdr);
1071 if (err) {
1072 ubi_err("paranoid check failed for PEB %d", pnum);
1073 goto fail;
1074 }
1075
1076 return 0;
1077
1078fail:
1079 ubi_dbg_dump_ec_hdr(ec_hdr);
1080 ubi_dbg_dump_stack();
1081 return 1;
1082}
1083
1084/**
1085 * paranoid_check_peb_ec_hdr - check that the erase counter header of a
1086 * physical eraseblock is in-place and is all right.
1087 * @ubi: UBI device description object
1088 * @pnum: the physical eraseblock number to check
1089 *
1090 * This function returns zero if the erase counter header is all right, %1 if
1091 * not, and a negative error code if an error occurred.
1092 */
1093static int paranoid_check_peb_ec_hdr(const struct ubi_device *ubi, int pnum)
1094{
1095 int err;
1096 uint32_t crc, hdr_crc;
1097 struct ubi_ec_hdr *ec_hdr;
1098
1099 ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL);
1100 if (!ec_hdr)
1101 return -ENOMEM;
1102
1103 err = ubi_io_read(ubi, ec_hdr, pnum, 0, UBI_EC_HDR_SIZE);
1104 if (err && err != UBI_IO_BITFLIPS && err != -EBADMSG)
1105 goto exit;
1106
1107 crc = crc32(UBI_CRC32_INIT, ec_hdr, UBI_EC_HDR_SIZE_CRC);
1108 hdr_crc = ubi32_to_cpu(ec_hdr->hdr_crc);
1109 if (hdr_crc != crc) {
1110 ubi_err("bad CRC, calculated %#08x, read %#08x", crc, hdr_crc);
1111 ubi_err("paranoid check failed for PEB %d", pnum);
1112 ubi_dbg_dump_ec_hdr(ec_hdr);
1113 ubi_dbg_dump_stack();
1114 err = 1;
1115 goto exit;
1116 }
1117
1118 err = paranoid_check_ec_hdr(ubi, pnum, ec_hdr);
1119
1120exit:
1121 kfree(ec_hdr);
1122 return err;
1123}
1124
1125/**
1126 * paranoid_check_vid_hdr - check that a volume identifier header is all right.
1127 * @ubi: UBI device description object
1128 * @pnum: physical eraseblock number the volume identifier header belongs to
1129 * @vid_hdr: the volume identifier header to check
1130 *
1131 * This function returns zero if the volume identifier header is all right, and
1132 * %1 if not.
1133 */
1134static int paranoid_check_vid_hdr(const struct ubi_device *ubi, int pnum,
1135 const struct ubi_vid_hdr *vid_hdr)
1136{
1137 int err;
1138 uint32_t magic;
1139
1140 magic = ubi32_to_cpu(vid_hdr->magic);
1141 if (magic != UBI_VID_HDR_MAGIC) {
1142 ubi_err("bad VID header magic %#08x at PEB %d, must be %#08x",
1143 magic, pnum, UBI_VID_HDR_MAGIC);
1144 goto fail;
1145 }
1146
1147 err = validate_vid_hdr(ubi, vid_hdr);
1148 if (err) {
1149 ubi_err("paranoid check failed for PEB %d", pnum);
1150 goto fail;
1151 }
1152
1153 return err;
1154
1155fail:
1156 ubi_err("paranoid check failed for PEB %d", pnum);
1157 ubi_dbg_dump_vid_hdr(vid_hdr);
1158 ubi_dbg_dump_stack();
1159 return 1;
1160
1161}
1162
1163/**
1164 * paranoid_check_peb_vid_hdr - check that the volume identifier header of a
1165 * physical eraseblock is in-place and is all right.
1166 * @ubi: UBI device description object
1167 * @pnum: the physical eraseblock number to check
1168 *
1169 * This function returns zero if the volume identifier header is all right,
1170 * %1 if not, and a negative error code if an error occurred.
1171 */
1172static int paranoid_check_peb_vid_hdr(const struct ubi_device *ubi, int pnum)
1173{
1174 int err;
1175 uint32_t crc, hdr_crc;
1176 struct ubi_vid_hdr *vid_hdr;
1177 void *p;
1178
1179 vid_hdr = ubi_zalloc_vid_hdr(ubi);
1180 if (!vid_hdr)
1181 return -ENOMEM;
1182
1183 p = (char *)vid_hdr - ubi->vid_hdr_shift;
1184 err = ubi_io_read(ubi, p, pnum, ubi->vid_hdr_aloffset,
1185 ubi->vid_hdr_alsize);
1186 if (err && err != UBI_IO_BITFLIPS && err != -EBADMSG)
1187 goto exit;
1188
1189	crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_VID_HDR_SIZE_CRC);
1190 hdr_crc = ubi32_to_cpu(vid_hdr->hdr_crc);
1191 if (hdr_crc != crc) {
1192 ubi_err("bad VID header CRC at PEB %d, calculated %#08x, "
1193 "read %#08x", pnum, crc, hdr_crc);
1194 ubi_err("paranoid check failed for PEB %d", pnum);
1195 ubi_dbg_dump_vid_hdr(vid_hdr);
1196 ubi_dbg_dump_stack();
1197 err = 1;
1198 goto exit;
1199 }
1200
1201 err = paranoid_check_vid_hdr(ubi, pnum, vid_hdr);
1202
1203exit:
1204 ubi_free_vid_hdr(ubi, vid_hdr);
1205 return err;
1206}
1207
1208/**
1209 * paranoid_check_all_ff - check that a region of flash is empty.
1210 * @ubi: UBI device description object
1211 * @pnum: the physical eraseblock number to check
1212 * @offset: the starting offset within the physical eraseblock to check
1213 * @len: the length of the region to check
1214 *
1215 * This function returns zero if only 0xFF bytes are present at offset
1216 * @offset of the physical eraseblock @pnum, %1 if not, and a negative error
1217 * code if an error occurred.
1218 */
1219static int paranoid_check_all_ff(const struct ubi_device *ubi, int pnum,
1220 int offset, int len)
1221{
1222 size_t read;
1223 int err;
1224 void *buf;
1225 loff_t addr = (loff_t)pnum * ubi->peb_size + offset;
1226
1227 buf = kzalloc(len, GFP_KERNEL);
1228 if (!buf)
1229 return -ENOMEM;
1230
1231 err = ubi->mtd->read(ubi->mtd, addr, len, &read, buf);
1232 if (err && err != -EUCLEAN) {
1233 ubi_err("error %d while reading %d bytes from PEB %d:%d, "
1234 "read %zd bytes", err, len, pnum, offset, read);
1235 goto error;
1236 }
1237
1238 err = check_pattern(buf, 0xFF, len);
1239 if (err == 0) {
1240 ubi_err("flash region at PEB %d:%d, length %d does not "
1241 "contain all 0xFF bytes", pnum, offset, len);
1242 goto fail;
1243 }
1244
1245 kfree(buf);
1246 return 0;
1247
1248fail:
1249 ubi_err("paranoid check failed for PEB %d", pnum);
1250 dbg_msg("hex dump of the %d-%d region", offset, offset + len);
1251 ubi_dbg_hexdump(buf, len);
1252 err = 1;
1253error:
1254 ubi_dbg_dump_stack();
1255 kfree(buf);
1256 return err;
1257}
1258
1259#endif /* CONFIG_MTD_UBI_DEBUG_PARANOID */
diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c
new file mode 100644
index 000000000000..d352c4575c3d
--- /dev/null
+++ b/drivers/mtd/ubi/kapi.c
@@ -0,0 +1,575 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21/* This file mostly implements UBI kernel API functions */
22
23#include <linux/module.h>
24#include <linux/err.h>
25#include <asm/div64.h>
26#include "ubi.h"
27
28/**
29 * ubi_get_device_info - get information about UBI device.
30 * @ubi_num: UBI device number
31 * @di: the information is stored here
32 *
33 * This function returns %0 in case of success and %-ENODEV if there is no
34 * such UBI device.
35 */
36int ubi_get_device_info(int ubi_num, struct ubi_device_info *di)
37{
38 const struct ubi_device *ubi;
39
40 if (!try_module_get(THIS_MODULE))
41 return -ENODEV;
42
43 if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES ||
44 !ubi_devices[ubi_num]) {
45 module_put(THIS_MODULE);
46 return -ENODEV;
47 }
48
49 ubi = ubi_devices[ubi_num];
50 di->ubi_num = ubi->ubi_num;
51 di->leb_size = ubi->leb_size;
52 di->min_io_size = ubi->min_io_size;
53 di->ro_mode = ubi->ro_mode;
54 di->cdev = MKDEV(ubi->major, 0);
55 module_put(THIS_MODULE);
56 return 0;
57}
58EXPORT_SYMBOL_GPL(ubi_get_device_info);
59
60/**
61 * ubi_get_volume_info - get information about UBI volume.
62 * @desc: volume descriptor
63 * @vi: the information is stored here
64 */
65void ubi_get_volume_info(struct ubi_volume_desc *desc,
66 struct ubi_volume_info *vi)
67{
68 const struct ubi_volume *vol = desc->vol;
69 const struct ubi_device *ubi = vol->ubi;
70
71 vi->vol_id = vol->vol_id;
72 vi->ubi_num = ubi->ubi_num;
73 vi->size = vol->reserved_pebs;
74 vi->used_bytes = vol->used_bytes;
75 vi->vol_type = vol->vol_type;
76 vi->corrupted = vol->corrupted;
77 vi->upd_marker = vol->upd_marker;
78 vi->alignment = vol->alignment;
79 vi->usable_leb_size = vol->usable_leb_size;
80 vi->name_len = vol->name_len;
81 vi->name = vol->name;
82 vi->cdev = MKDEV(ubi->major, vi->vol_id + 1);
83}
84EXPORT_SYMBOL_GPL(ubi_get_volume_info);
85
86/**
87 * ubi_open_volume - open UBI volume.
88 * @ubi_num: UBI device number
89 * @vol_id: volume ID
90 * @mode: open mode
91 *
92 * The @mode parameter specifies if the volume should be opened in read-only
93 * mode, read-write mode, or exclusive mode. The exclusive mode guarantees that
94 * nobody else will be able to open this volume. UBI allows many volume
95 * readers and one writer at a time.
96 *
97 * If a static volume is being opened for the first time since boot, it will be
98 * checked by this function, which means it will be fully read and the CRC
99 * checksum of each logical eraseblock will be checked.
100 *
101 * This function returns a volume descriptor in case of success and a negative
102 * error code in case of failure.
103 */
104struct ubi_volume_desc *ubi_open_volume(int ubi_num, int vol_id, int mode)
105{
106 int err;
107 struct ubi_volume_desc *desc;
108 struct ubi_device *ubi = ubi_devices[ubi_num];
109 struct ubi_volume *vol;
110
111 dbg_msg("open device %d volume %d, mode %d", ubi_num, vol_id, mode);
112
113 err = -ENODEV;
114 if (!try_module_get(THIS_MODULE))
115 return ERR_PTR(err);
116
117 if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES || !ubi)
118 goto out_put;
119
120 err = -EINVAL;
121 if (vol_id < 0 || vol_id >= ubi->vtbl_slots)
122 goto out_put;
123 if (mode != UBI_READONLY && mode != UBI_READWRITE &&
124 mode != UBI_EXCLUSIVE)
125 goto out_put;
126
127 desc = kmalloc(sizeof(struct ubi_volume_desc), GFP_KERNEL);
128 if (!desc) {
129 err = -ENOMEM;
130 goto out_put;
131 }
132
133 spin_lock(&ubi->volumes_lock);
134 vol = ubi->volumes[vol_id];
135 if (!vol) {
136 err = -ENODEV;
137 goto out_unlock;
138 }
139
140 err = -EBUSY;
141 switch (mode) {
142 case UBI_READONLY:
143 if (vol->exclusive)
144 goto out_unlock;
145 vol->readers += 1;
146 break;
147
148 case UBI_READWRITE:
149 if (vol->exclusive || vol->writers > 0)
150 goto out_unlock;
151 vol->writers += 1;
152 break;
153
154 case UBI_EXCLUSIVE:
155 if (vol->exclusive || vol->writers || vol->readers)
156 goto out_unlock;
157 vol->exclusive = 1;
158 break;
159 }
160 spin_unlock(&ubi->volumes_lock);
161
162 desc->vol = vol;
163 desc->mode = mode;
164
165 /*
166 * To prevent simultaneous checks of the same volume we use @vtbl_mutex,
167 * although it is not the purpose it was introduced for.
168 */
169 mutex_lock(&ubi->vtbl_mutex);
170 if (!vol->checked) {
171 /* This is the first open - check the volume */
172 err = ubi_check_volume(ubi, vol_id);
173 if (err < 0) {
174 mutex_unlock(&ubi->vtbl_mutex);
175 ubi_close_volume(desc);
176 return ERR_PTR(err);
177 }
178 if (err == 1) {
179 ubi_warn("volume %d on UBI device %d is corrupted",
180 vol_id, ubi->ubi_num);
181 vol->corrupted = 1;
182 }
183 vol->checked = 1;
184 }
185 mutex_unlock(&ubi->vtbl_mutex);
186 return desc;
187
188out_unlock:
189 spin_unlock(&ubi->volumes_lock);
190 kfree(desc);
191out_put:
192 module_put(THIS_MODULE);
193 return ERR_PTR(err);
194}
195EXPORT_SYMBOL_GPL(ubi_open_volume);
196
197/**
198 * ubi_open_volume_nm - open UBI volume by name.
199 * @ubi_num: UBI device number
200 * @name: volume name
201 * @mode: open mode
202 *
203 * This function is similar to 'ubi_open_volume()', but opens a volume by name.
204 */
205struct ubi_volume_desc *ubi_open_volume_nm(int ubi_num, const char *name,
206 int mode)
207{
208 int i, vol_id = -1, len;
209 struct ubi_volume_desc *ret;
210 struct ubi_device *ubi;
211
212 dbg_msg("open volume %s, mode %d", name, mode);
213
214 if (!name)
215 return ERR_PTR(-EINVAL);
216
217 len = strnlen(name, UBI_VOL_NAME_MAX + 1);
218 if (len > UBI_VOL_NAME_MAX)
219 return ERR_PTR(-EINVAL);
220
221 ret = ERR_PTR(-ENODEV);
222 if (!try_module_get(THIS_MODULE))
223 return ret;
224
225 if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES || !ubi_devices[ubi_num])
226 goto out_put;
227
228 ubi = ubi_devices[ubi_num];
229
230 spin_lock(&ubi->volumes_lock);
231 /* Walk all volumes of this UBI device */
232 for (i = 0; i < ubi->vtbl_slots; i++) {
233 struct ubi_volume *vol = ubi->volumes[i];
234
235 if (vol && len == vol->name_len && !strcmp(name, vol->name)) {
236 vol_id = i;
237 break;
238 }
239 }
240 spin_unlock(&ubi->volumes_lock);
241
242 if (vol_id < 0)
243 goto out_put;
244
245 ret = ubi_open_volume(ubi_num, vol_id, mode);
246
247out_put:
248 module_put(THIS_MODULE);
249 return ret;
250}
251EXPORT_SYMBOL_GPL(ubi_open_volume_nm);
252
253/**
254 * ubi_close_volume - close UBI volume.
255 * @desc: volume descriptor
256 */
257void ubi_close_volume(struct ubi_volume_desc *desc)
258{
259 struct ubi_volume *vol = desc->vol;
260
261 dbg_msg("close volume %d, mode %d", vol->vol_id, desc->mode);
262
263 spin_lock(&vol->ubi->volumes_lock);
264 switch (desc->mode) {
265 case UBI_READONLY:
266 vol->readers -= 1;
267 break;
268 case UBI_READWRITE:
269 vol->writers -= 1;
270 break;
271 case UBI_EXCLUSIVE:
272 vol->exclusive = 0;
273 }
274 spin_unlock(&vol->ubi->volumes_lock);
275
276 kfree(desc);
277 module_put(THIS_MODULE);
278}
279EXPORT_SYMBOL_GPL(ubi_close_volume);
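
/*
 * Editorial sketch of typical open/close usage by a kernel client of this
 * API (the device/volume numbers, @buf and @count are made up for the
 * example; @check = 1 asks for the static-volume CRC check described above):
 *
 *	struct ubi_volume_desc *desc;
 *	int err;
 *
 *	desc = ubi_open_volume(0, 3, UBI_READONLY);
 *	if (IS_ERR(desc))
 *		return PTR_ERR(desc);
 *	err = ubi_leb_read(desc, 0, buf, 0, count, 1);
 *	ubi_close_volume(desc);
 *	return err;
 */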
280
281/**
282 * ubi_leb_read - read data.
283 * @desc: volume descriptor
284 * @lnum: logical eraseblock number to read from
285 * @buf: buffer where to store the read data
286 * @offset: offset within the logical eraseblock to read from
287 * @len: how many bytes to read
288 * @check: whether UBI has to check the read data's CRC or not.
289 *
290 * This function reads data from offset @offset of logical eraseblock @lnum and
291 * stores the data at @buf. When reading from static volumes, @check specifies
292 * whether the data has to be checked or not. If yes, the whole logical
293 * eraseblock will be read and its CRC checksum will be checked (i.e., the CRC
294 * checksum is per-eraseblock). So checking may substantially slow down the
295 * read speed. The @check argument is ignored for dynamic volumes.
296 *
297 * In case of success, this function returns zero. In case of failure, this
298 * function returns a negative error code.
299 *
300 * %-EBADMSG error code is returned:
301 * o for both static and dynamic volumes if MTD driver has detected a data
302 * integrity problem (unrecoverable ECC checksum mismatch in case of NAND);
303 * o for static volumes in case of data CRC mismatch.
304 *
305 * If the volume is damaged because of an interrupted update this function just
306 * returns immediately with %-EBADF error code.
307 */
308int ubi_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
309 int len, int check)
310{
311 struct ubi_volume *vol = desc->vol;
312 struct ubi_device *ubi = vol->ubi;
313 int err, vol_id = vol->vol_id;
314
315 dbg_msg("read %d bytes from LEB %d:%d:%d", len, vol_id, lnum, offset);
316
317 if (vol_id < 0 || vol_id >= ubi->vtbl_slots || lnum < 0 ||
318 lnum >= vol->used_ebs || offset < 0 || len < 0 ||
319 offset + len > vol->usable_leb_size)
320 return -EINVAL;
321
322 if (vol->vol_type == UBI_STATIC_VOLUME && lnum == vol->used_ebs - 1 &&
323 offset + len > vol->last_eb_bytes)
324 return -EINVAL;
325
326 if (vol->upd_marker)
327 return -EBADF;
328 if (len == 0)
329 return 0;
330
331 err = ubi_eba_read_leb(ubi, vol_id, lnum, buf, offset, len, check);
332	if (err == -EBADMSG && vol->vol_type == UBI_STATIC_VOLUME) {
333 ubi_warn("mark volume %d as corrupted", vol_id);
334 vol->corrupted = 1;
335 }
336
337 return err;
338}
339EXPORT_SYMBOL_GPL(ubi_leb_read);
340
341/**
342 * ubi_leb_write - write data.
343 * @desc: volume descriptor
344 * @lnum: logical eraseblock number to write to
345 * @buf: data to write
346 * @offset: offset within the logical eraseblock where to write
347 * @len: how many bytes to write
348 * @dtype: expected data type
349 *
350 * This function writes @len bytes of data from @buf to offset @offset of
351 * logical eraseblock @lnum. The @dtype argument describes the expected
352 * lifetime of the data.
353 *
354 * This function takes care of physical eraseblock write failures. If the
355 * write operation to the physical eraseblock fails, the logical eraseblock is
356 * re-mapped to another physical eraseblock, the data is recovered, and the
357 * write finishes. UBI has a pool of reserved physical eraseblocks for this.
358 *
359 * If all the data were successfully written, zero is returned. If an error
360 * occurred and UBI has not been able to recover from it, this function returns
361 * a negative error code. Note, in case of an error, it is possible that
362 * something was still written to the flash media, but that may be some
363 * garbage.
364 *
365 * If the volume is damaged because of an interrupted update this function just
366 * returns immediately with %-EBADF code.
367 */
368int ubi_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
369 int offset, int len, int dtype)
370{
371 struct ubi_volume *vol = desc->vol;
372 struct ubi_device *ubi = vol->ubi;
373 int vol_id = vol->vol_id;
374
375 dbg_msg("write %d bytes to LEB %d:%d:%d", len, vol_id, lnum, offset);
376
377 if (vol_id < 0 || vol_id >= ubi->vtbl_slots)
378 return -EINVAL;
379
380 if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME)
381 return -EROFS;
382
383 if (lnum < 0 || lnum >= vol->reserved_pebs || offset < 0 || len < 0 ||
384 offset + len > vol->usable_leb_size || offset % ubi->min_io_size ||
385 len % ubi->min_io_size)
386 return -EINVAL;
387
388 if (dtype != UBI_LONGTERM && dtype != UBI_SHORTTERM &&
389 dtype != UBI_UNKNOWN)
390 return -EINVAL;
391
392 if (vol->upd_marker)
393 return -EBADF;
394
395 if (len == 0)
396 return 0;
397
398 return ubi_eba_write_leb(ubi, vol_id, lnum, buf, offset, len, dtype);
399}
400EXPORT_SYMBOL_GPL(ubi_leb_write);
401
402/**
403 * ubi_leb_change - change logical eraseblock atomically.
404 * @desc: volume descriptor
405 * @lnum: logical eraseblock number to change
406 * @buf: data to write
407 * @len: how many bytes to write
408 * @dtype: expected data type
409 *
410 * This function changes the contents of a logical eraseblock atomically. @buf
411 * has to contain new logical eraseblock data, and @len - the length of the
412 * data, which has to be aligned. The length may be shorter than the logical
413 * eraseblock size, and the logical eraseblock may be appended to more times
414 * later on. This function guarantees that in case of an unclean reboot the old
415 * contents are preserved. Returns zero in case of success and a negative error
416 * code in case of failure.
417 */
418int ubi_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
419 int len, int dtype)
420{
421 struct ubi_volume *vol = desc->vol;
422 struct ubi_device *ubi = vol->ubi;
423 int vol_id = vol->vol_id;
424
425 dbg_msg("atomically write %d bytes to LEB %d:%d", len, vol_id, lnum);
426
427 if (vol_id < 0 || vol_id >= ubi->vtbl_slots)
428 return -EINVAL;
429
430 if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME)
431 return -EROFS;
432
433 if (lnum < 0 || lnum >= vol->reserved_pebs || len < 0 ||
434 len > vol->usable_leb_size || len % ubi->min_io_size)
435 return -EINVAL;
436
437 if (dtype != UBI_LONGTERM && dtype != UBI_SHORTTERM &&
438 dtype != UBI_UNKNOWN)
439 return -EINVAL;
440
441 if (vol->upd_marker)
442 return -EBADF;
443
444 if (len == 0)
445 return 0;
446
447 return ubi_eba_atomic_leb_change(ubi, vol_id, lnum, buf, len, dtype);
448}
449EXPORT_SYMBOL_GPL(ubi_leb_change);
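
/*
 * Editorial sketch: the atomic change operation suits on-flash data which
 * must never be observed half-written, e.g. a table kept in LEB 0 (@tbl and
 * @tbl_size are hypothetical; per the checks above, @tbl_size has to be
 * aligned to the minimal I/O unit size):
 *
 *	err = ubi_leb_change(desc, 0, tbl, tbl_size, UBI_LONGTERM);
 *	if (err)
 *		return err;
 */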
450
451/**
452 * ubi_leb_erase - erase logical eraseblock.
453 * @desc: volume descriptor
454 * @lnum: logical eraseblock number
455 *
456 * This function un-maps logical eraseblock @lnum and synchronously erases the
457 * correspondent physical eraseblock. Returns zero in case of success and a
458 * negative error code in case of failure.
459 *
460 * If the volume is damaged because of an interrupted update this function just
461 * returns immediately with %-EBADF code.
462 */
463int ubi_leb_erase(struct ubi_volume_desc *desc, int lnum)
464{
465 struct ubi_volume *vol = desc->vol;
466 struct ubi_device *ubi = vol->ubi;
467 int err, vol_id = vol->vol_id;
468
469 dbg_msg("erase LEB %d:%d", vol_id, lnum);
470
471 if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME)
472 return -EROFS;
473
474 if (lnum < 0 || lnum >= vol->reserved_pebs)
475 return -EINVAL;
476
477 if (vol->upd_marker)
478 return -EBADF;
479
480 err = ubi_eba_unmap_leb(ubi, vol_id, lnum);
481 if (err)
482 return err;
483
484 return ubi_wl_flush(ubi);
485}
486EXPORT_SYMBOL_GPL(ubi_leb_erase);
487
488/**
489 * ubi_leb_unmap - un-map logical eraseblock.
490 * @desc: volume descriptor
491 * @lnum: logical eraseblock number
492 *
493 * This function un-maps logical eraseblock @lnum and schedules the
494 * corresponding physical eraseblock for erasure, so that it will eventually be
495 * physically erased in the background. This operation is much faster than the
496 * erase operation.
497 *
498 * Unlike erase, the un-map operation does not guarantee that the logical
499 * eraseblock will contain all 0xFF bytes when UBI is initialized again. For
500 * example, if several logical eraseblocks are un-mapped, and an unclean reboot
501 * happens after this, the logical eraseblocks will not necessarily be
502 * un-mapped again when this MTD device is attached. They may actually be
503 * mapped to the same physical eraseblocks again. So, this function has to be
504 * used with care.
505 *
506 * In other words, when un-mapping a logical eraseblock, UBI does not store
507 * any information about this on the flash media, it just marks the logical
508 * eraseblock as "un-mapped" in RAM. If UBI is detached before the physical
509 * eraseblock is physically erased, it will be mapped again to the same logical
510 * eraseblock when the MTD device is attached again.
511 *
512 * The main and obvious use-case of this function is when the contents of a
513 * logical eraseblock has to be re-written. Then it is much more efficient to
514 * first un-map it, then write new data, rather than first erase it, then write
515 * new data. Note, once new data has been written to the logical eraseblock,
516 * UBI guarantees that the old contents are gone forever. In other words, if an
517 * unclean reboot happens after the logical eraseblock has been un-mapped and
518 * then written to, it will contain the last written data.
519 *
520 * This function returns zero in case of success and a negative error code in
521 * case of failure. If the volume is damaged because of an interrupted update
522 * this function just returns immediately with %-EBADF code.
523 */
524int ubi_leb_unmap(struct ubi_volume_desc *desc, int lnum)
525{
526 struct ubi_volume *vol = desc->vol;
527 struct ubi_device *ubi = vol->ubi;
528 int vol_id = vol->vol_id;
529
530 dbg_msg("unmap LEB %d:%d", vol_id, lnum);
531
532 if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME)
533 return -EROFS;
534
535 if (lnum < 0 || lnum >= vol->reserved_pebs)
536 return -EINVAL;
537
538 if (vol->upd_marker)
539 return -EBADF;
540
541 return ubi_eba_unmap_leb(ubi, vol_id, lnum);
542}
543EXPORT_SYMBOL_GPL(ubi_leb_unmap);
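The efficient re-write pattern described in the comment above might look like this sketch (the helper name is hypothetical; @len must be aligned to the minimal I/O unit):

	/* Sketch: replace a mapped LEB's contents without a synchronous erase */
	static int example_rewrite_leb(struct ubi_volume_desc *desc, int lnum,
				       const void *data, int len)
	{
		int err;

		err = ubi_leb_unmap(desc, lnum);	/* cheap, in-RAM operation */
		if (err)
			return err;

		/* Once this write succeeds, the old contents cannot reappear */
		return ubi_leb_write(desc, lnum, data, 0, len, UBI_UNKNOWN);
	}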
544
545/**
546 * ubi_is_mapped - check if logical eraseblock is mapped.
547 * @desc: volume descriptor
548 * @lnum: logical eraseblock number
549 *
550 * This function checks if logical eraseblock @lnum is mapped to a physical
551 * eraseblock. If a logical eraseblock is un-mapped, this does not necessarily
552 * mean it will still be un-mapped after the UBI device is re-attached. The
553 * logical eraseblock may become mapped to the physical eraseblock it was last
554 * mapped to.
555 *
556 * This function returns %1 if the LEB is mapped, %0 if not, and a negative
557 * error code in case of failure. If the volume is damaged because of an
558 * interrupted update this function just returns immediately with %-EBADF error
559 * code.
560 */
561int ubi_is_mapped(struct ubi_volume_desc *desc, int lnum)
562{
563 struct ubi_volume *vol = desc->vol;
564
565 dbg_msg("test LEB %d:%d", vol->vol_id, lnum);
566
567 if (lnum < 0 || lnum >= vol->reserved_pebs)
568 return -EINVAL;
569
570 if (vol->upd_marker)
571 return -EBADF;
572
573 return vol->eba_tbl[lnum] >= 0;
574}
575EXPORT_SYMBOL_GPL(ubi_is_mapped);
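A typical caller combines the mapping check with a write, as in this sketch; note the three-way result (negative, %0, %1). Here @desc, @lnum, @buf and @len are assumed to come from the surrounding context:

	/* Sketch: initialize a LEB only if nothing was written to it yet */
	err = ubi_is_mapped(desc, lnum);
	if (err < 0)
		return err;		/* bad lnum or damaged volume */
	if (err == 0)
		err = ubi_leb_write(desc, lnum, buf, 0, len, UBI_UNKNOWN);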
diff --git a/drivers/mtd/ubi/misc.c b/drivers/mtd/ubi/misc.c
new file mode 100644
index 000000000000..38d4e6757dc7
--- /dev/null
+++ b/drivers/mtd/ubi/misc.c
@@ -0,0 +1,105 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21/* Here we keep miscellaneous functions which are used all over the UBI code */
22
23#include "ubi.h"
24
25/**
26 * calc_data_len - calculate how much real data is stored in a buffer.
27 * @ubi: UBI device description object
28 * @buf: a buffer with the contents of the physical eraseblock
29 * @length: the buffer length
30 *
 31 * This function calculates how much "real data" is stored in @buf and returns
 32 * the length. Trailing 0xFF bytes at the end of the buffer are not considered
 33 * "real data".
34 */
35int ubi_calc_data_len(const struct ubi_device *ubi, const void *buf,
36 int length)
37{
38 int i;
39
40 ubi_assert(length % ubi->min_io_size == 0);
41
42 for (i = length - 1; i >= 0; i--)
43 if (((const uint8_t *)buf)[i] != 0xFF)
44 break;
45
46 /* The resulting length must be aligned to the minimum flash I/O size */
47 length = ALIGN(i + 1, ubi->min_io_size);
48 return length;
49}
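As a worked example (values assumed): with min_io_size = 512 and @length = 2048, if the last non-0xFF byte sits at offset 700, the loop stops at i = 700 and the function returns ALIGN(700 + 1, 512) = 1024, i.e. two minimal I/O units of real data; an all-0xFF buffer yields 0.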
50
51/**
52 * ubi_check_volume - check the contents of a static volume.
53 * @ubi: UBI device description object
54 * @vol_id: ID of the volume to check
55 *
56 * This function checks if static volume @vol_id is corrupted by fully reading
57 * it and checking data CRC. This function returns %0 if the volume is not
58 * corrupted, %1 if it is corrupted and a negative error code in case of
59 * failure. Dynamic volumes are not checked and zero is returned immediately.
60 */
61int ubi_check_volume(struct ubi_device *ubi, int vol_id)
62{
63 void *buf;
64 int err = 0, i;
65 struct ubi_volume *vol = ubi->volumes[vol_id];
66
67 if (vol->vol_type != UBI_STATIC_VOLUME)
68 return 0;
69
70 buf = kmalloc(vol->usable_leb_size, GFP_KERNEL);
71 if (!buf)
72 return -ENOMEM;
73
74 for (i = 0; i < vol->used_ebs; i++) {
75 int size;
76
77 if (i == vol->used_ebs - 1)
78 size = vol->last_eb_bytes;
79 else
80 size = vol->usable_leb_size;
81
82 err = ubi_eba_read_leb(ubi, vol_id, i, buf, 0, size, 1);
83 if (err) {
84 if (err == -EBADMSG)
85 err = 1;
86 break;
87 }
88 }
89
90 kfree(buf);
91 return err;
92}
93
94/**
 95 * ubi_calculate_reserved - calculate how many PEBs must be reserved for bad
96 * eraseblock handling.
97 * @ubi: UBI device description object
98 */
99void ubi_calculate_reserved(struct ubi_device *ubi)
100{
101 ubi->beb_rsvd_level = ubi->good_peb_count/100;
102 ubi->beb_rsvd_level *= CONFIG_MTD_UBI_BEB_RESERVE;
103	if (ubi->beb_rsvd_level < MIN_RESERVED_PEBS)
104		ubi->beb_rsvd_level = MIN_RESERVED_PEBS;
105}
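For example, with 4096 good PEBs and CONFIG_MTD_UBI_BEB_RESERVE set to 1 (percent), beb_rsvd_level becomes 4096/100 * 1 = 40 PEBs; on a small device where the computed value falls below the minimum, it is clamped to MIN_RESERVED_PEBS (2). The config value of 1 is only an assumed example here.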
diff --git a/drivers/mtd/ubi/scan.c b/drivers/mtd/ubi/scan.c
new file mode 100644
index 000000000000..473f3200b868
--- /dev/null
+++ b/drivers/mtd/ubi/scan.c
@@ -0,0 +1,1368 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21/*
22 * UBI scanning unit.
23 *
24 * This unit is responsible for scanning the flash media, checking UBI
25 * headers and providing complete information about the UBI flash image.
26 *
 27 * The scanning information is represented by a &struct ubi_scan_info object.
28 * Information about found volumes is represented by &struct ubi_scan_volume
29 * objects which are kept in volume RB-tree with root at the @volumes field.
30 * The RB-tree is indexed by the volume ID.
31 *
32 * Found logical eraseblocks are represented by &struct ubi_scan_leb objects.
33 * These objects are kept in per-volume RB-trees with the root at the
34 * corresponding &struct ubi_scan_volume object. To put it differently, we keep
35 * an RB-tree of per-volume objects and each of these objects is the root of
36 * RB-tree of per-eraseblock objects.
37 *
38 * Corrupted physical eraseblocks are put to the @corr list, free physical
 39 * eraseblocks are put to the @free list, and the physical eraseblocks to be
40 * erased are put to the @erase list.
41 */
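Given this layout, a consumer of the scanning results walks the nested trees. A minimal sketch, using the same ubi_rb_for_each_entry() helper this file itself uses further below (@si is assumed to be a struct ubi_scan_info pointer):

	/* Sketch: visit every LEB found by scanning */
	struct rb_node *rb1, *rb2;
	struct ubi_scan_volume *sv;
	struct ubi_scan_leb *seb;

	ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb)		/* each volume */
		ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb)	/* each LEB */
			dbg_bld("LEB %d:%d -> PEB %d, EC %d",
				sv->vol_id, seb->lnum, seb->pnum, seb->ec);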
42
43#include <linux/err.h>
44#include <linux/crc32.h>
45#include "ubi.h"
46
47#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
48static int paranoid_check_si(const struct ubi_device *ubi,
49 struct ubi_scan_info *si);
50#else
51#define paranoid_check_si(ubi, si) 0
52#endif
53
54/* Temporary variables used during scanning */
55static struct ubi_ec_hdr *ech;
56static struct ubi_vid_hdr *vidh;
57
58int ubi_scan_add_to_list(struct ubi_scan_info *si, int pnum, int ec,
59 struct list_head *list)
60{
61 struct ubi_scan_leb *seb;
62
63 if (list == &si->free)
64 dbg_bld("add to free: PEB %d, EC %d", pnum, ec);
65 else if (list == &si->erase)
66 dbg_bld("add to erase: PEB %d, EC %d", pnum, ec);
67 else if (list == &si->corr)
68 dbg_bld("add to corrupted: PEB %d, EC %d", pnum, ec);
69 else if (list == &si->alien)
70 dbg_bld("add to alien: PEB %d, EC %d", pnum, ec);
71 else
72 BUG();
73
74 seb = kmalloc(sizeof(struct ubi_scan_leb), GFP_KERNEL);
75 if (!seb)
76 return -ENOMEM;
77
78 seb->pnum = pnum;
79 seb->ec = ec;
80 list_add_tail(&seb->u.list, list);
81 return 0;
82}
83
84/**
85 * commit_to_mean_value - commit intermediate results to the final mean erase
86 * counter value.
87 * @si: scanning information
88 *
89 * This is a helper function which calculates partial mean erase counter mean
90 * value and adds it to the resulting mean value. As we can work only in
91 * integer arithmetic and we want to calculate the mean value of erase counter
92 * accurately, we first sum erase counter values in @si->ec_sum variable and
93 * count these components in @si->ec_count. If this temporary @si->ec_sum is
94 * going to overflow, we calculate the partial mean value
95 * (@si->ec_sum/@si->ec_count) and add it to @si->mean_ec.
96 */
97static void commit_to_mean_value(struct ubi_scan_info *si)
98{
 99	if (si->ec_sum % si->ec_count >= si->ec_count / 2)
100		si->mean_ec += 1;
101	si->ec_sum /= si->ec_count;
102	si->mean_ec += si->ec_sum;
103}
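The same overflow-guarded running-mean technique in isolation, as a hypothetical stand-alone sketch (the type and function names are invented for illustration):

	struct mean_state {
		unsigned long long sum;	/* samples accumulated so far */
		int count;		/* number of samples in @sum */
		int mean;		/* mean value committed so far */
	};

	static void add_sample(struct mean_state *s, unsigned int ec)
	{
		if (s->sum + ec < s->sum) {
			/* @sum is about to overflow - fold it into @mean */
			if (s->sum % s->count >= s->count / 2)
				s->mean += 1;	/* round to nearest */
			s->mean += s->sum / s->count;
			s->sum = 0;
			s->count = 0;
		}
		s->sum += ec;
		s->count += 1;
	}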
104
105/**
106 * validate_vid_hdr - check that volume identifier header is correct and
107 * consistent.
108 * @vid_hdr: the volume identifier header to check
109 * @sv: information about the volume this logical eraseblock belongs to
110 * @pnum: physical eraseblock number the VID header came from
111 *
112 * This function checks that data stored in @vid_hdr is consistent. Returns
113 * non-zero if an inconsistency was found and zero if not.
114 *
115 * Note, UBI sanity-checks everything it reads from the flash media. Most of
116 * the checks are done in the I/O unit. Here we check that the information in
117 * the VID header is consistent with the information in other VID headers of
118 * the same volume.
119 */
120static int validate_vid_hdr(const struct ubi_vid_hdr *vid_hdr,
121 const struct ubi_scan_volume *sv, int pnum)
122{
123 int vol_type = vid_hdr->vol_type;
124 int vol_id = ubi32_to_cpu(vid_hdr->vol_id);
125 int used_ebs = ubi32_to_cpu(vid_hdr->used_ebs);
126 int data_pad = ubi32_to_cpu(vid_hdr->data_pad);
127
128 if (sv->leb_count != 0) {
129 int sv_vol_type;
130
131 /*
132 * This is not the first logical eraseblock belonging to this
133 * volume. Ensure that the data in its VID header is consistent
134		 * with the data in previous logical eraseblock headers.
135 */
136
137 if (vol_id != sv->vol_id) {
138 dbg_err("inconsistent vol_id");
139 goto bad;
140 }
141
142 if (sv->vol_type == UBI_STATIC_VOLUME)
143 sv_vol_type = UBI_VID_STATIC;
144 else
145 sv_vol_type = UBI_VID_DYNAMIC;
146
147 if (vol_type != sv_vol_type) {
148 dbg_err("inconsistent vol_type");
149 goto bad;
150 }
151
152 if (used_ebs != sv->used_ebs) {
153 dbg_err("inconsistent used_ebs");
154 goto bad;
155 }
156
157 if (data_pad != sv->data_pad) {
158 dbg_err("inconsistent data_pad");
159 goto bad;
160 }
161 }
162
163 return 0;
164
165bad:
166 ubi_err("inconsistent VID header at PEB %d", pnum);
167 ubi_dbg_dump_vid_hdr(vid_hdr);
168 ubi_dbg_dump_sv(sv);
169 return -EINVAL;
170}
171
172/**
173 * add_volume - add volume to the scanning information.
174 * @si: scanning information
175 * @vol_id: ID of the volume to add
176 * @pnum: physical eraseblock number
177 * @vid_hdr: volume identifier header
178 *
179 * If the volume corresponding to the @vid_hdr logical eraseblock is already
180 * present in the scanning information, this function does nothing. Otherwise
181 * it adds corresponding volume to the scanning information. Returns a pointer
182 * to the scanning volume object in case of success and an error pointer in
183 * case of failure.
184 */
185static struct ubi_scan_volume *add_volume(struct ubi_scan_info *si, int vol_id,
186 int pnum,
187 const struct ubi_vid_hdr *vid_hdr)
188{
189 struct ubi_scan_volume *sv;
190 struct rb_node **p = &si->volumes.rb_node, *parent = NULL;
191
192 ubi_assert(vol_id == ubi32_to_cpu(vid_hdr->vol_id));
193
194	/* Walk the volume RB-tree to see if this volume is already present */
195 while (*p) {
196 parent = *p;
197 sv = rb_entry(parent, struct ubi_scan_volume, rb);
198
199 if (vol_id == sv->vol_id)
200 return sv;
201
202 if (vol_id > sv->vol_id)
203 p = &(*p)->rb_left;
204 else
205 p = &(*p)->rb_right;
206 }
207
208 /* The volume is absent - add it */
209 sv = kmalloc(sizeof(struct ubi_scan_volume), GFP_KERNEL);
210 if (!sv)
211 return ERR_PTR(-ENOMEM);
212
213 sv->highest_lnum = sv->leb_count = 0;
214	sv->last_data_size = 0;
215 sv->vol_id = vol_id;
216 sv->root = RB_ROOT;
217 sv->used_ebs = ubi32_to_cpu(vid_hdr->used_ebs);
218 sv->data_pad = ubi32_to_cpu(vid_hdr->data_pad);
219 sv->compat = vid_hdr->compat;
220 sv->vol_type = vid_hdr->vol_type == UBI_VID_DYNAMIC ? UBI_DYNAMIC_VOLUME
221 : UBI_STATIC_VOLUME;
222 if (vol_id > si->highest_vol_id)
223 si->highest_vol_id = vol_id;
224
225 rb_link_node(&sv->rb, parent, p);
226 rb_insert_color(&sv->rb, &si->volumes);
227 si->vols_found += 1;
228 dbg_bld("added volume %d", vol_id);
229 return sv;
230}
231
232/**
233 * compare_lebs - find out which logical eraseblock is newer.
234 * @ubi: UBI device description object
235 * @seb: first logical eraseblock to compare
236 * @pnum: physical eraseblock number of the second logical eraseblock to
237 * compare
238 * @vid_hdr: volume identifier header of the second logical eraseblock
239 *
240 * This function compares two copies of a LEB and reports which one is newer.
241 * In case of success this function returns a non-negative value; in case of
242 * failure, a negative error code is returned. The success return codes use the
243 * following bits:
244 * o bit 0 is cleared: the first PEB (described by @seb) is newer than the
245 *   second PEB (described by @pnum and @vid_hdr);
246 * o bit 0 is set: the second PEB is newer;
247 * o bit 1 is cleared: no bit-flips were detected in the newer LEB;
248 * o bit 1 is set: bit-flips were detected in the newer LEB;
249 * o bit 2 is cleared: the older LEB is not corrupted;
250 * o bit 2 is set: the older LEB is corrupted.
251 * o bit 2 is set: the older LEB is corrupted. */
252static int compare_lebs(const struct ubi_device *ubi,
253 const struct ubi_scan_leb *seb, int pnum,
254 const struct ubi_vid_hdr *vid_hdr)
255{
256 void *buf;
257 int len, err, second_is_newer, bitflips = 0, corrupted = 0;
258 uint32_t data_crc, crc;
259 struct ubi_vid_hdr *vidh = NULL;
260 unsigned long long sqnum2 = ubi64_to_cpu(vid_hdr->sqnum);
261
262 if (seb->sqnum == 0 && sqnum2 == 0) {
263 long long abs, v1 = seb->leb_ver, v2 = ubi32_to_cpu(vid_hdr->leb_ver);
264
265 /*
266 * UBI constantly increases the logical eraseblock version
267 * number and it can overflow. Thus, we have to bear in mind
268		 * that versions that are close to %0xFFFFFFFF are less than
269		 * versions that are close to %0.
270		 *
271		 * The UBI WL unit guarantees that the number of pending tasks
272		 * is not greater than %0x7FFFFFFF. So, if the difference
273		 * between any two versions is greater than or equal to
274		 * %0x7FFFFFFF, there was an overflow and the logical
275		 * eraseblock with the lower version is actually newer than
276		 * the one with the higher version.
277 *
278 * FIXME: but this is anyway obsolete and will be removed at
279 * some point.
280 */
281
282 dbg_bld("using old crappy leb_ver stuff");
283
284 abs = v1 - v2;
285 if (abs < 0)
286 abs = -abs;
287
288 if (abs < 0x7FFFFFFF)
289 /* Non-overflow situation */
290 second_is_newer = (v2 > v1);
291 else
292 second_is_newer = (v2 < v1);
293 } else
294		/* Obviously the LEB with the lower sequence number is older */
295 second_is_newer = sqnum2 > seb->sqnum;
296
297 /*
298 * Now we know which copy is newer. If the copy flag of the PEB with
299 * newer version is not set, then we just return, otherwise we have to
300 * check data CRC. For the second PEB we already have the VID header,
301 * for the first one - we'll need to re-read it from flash.
302 *
303 * FIXME: this may be optimized so that we wouldn't read twice.
304 */
305
306 if (second_is_newer) {
307 if (!vid_hdr->copy_flag) {
308 /* It is not a copy, so it is newer */
309 dbg_bld("second PEB %d is newer, copy_flag is unset",
310 pnum);
311 return 1;
312 }
313 } else {
314 pnum = seb->pnum;
315
316 vidh = ubi_zalloc_vid_hdr(ubi);
317 if (!vidh)
318 return -ENOMEM;
319
320 err = ubi_io_read_vid_hdr(ubi, pnum, vidh, 0);
321 if (err) {
322 if (err == UBI_IO_BITFLIPS)
323 bitflips = 1;
324 else {
325 dbg_err("VID of PEB %d header is bad, but it "
326 "was OK earlier", pnum);
327 if (err > 0)
328 err = -EIO;
329
330 goto out_free_vidh;
331 }
332 }
333
334 if (!vidh->copy_flag) {
335 /* It is not a copy, so it is newer */
336 dbg_bld("first PEB %d is newer, copy_flag is unset",
337 pnum);
338 err = bitflips << 1;
339 goto out_free_vidh;
340 }
341
342 vid_hdr = vidh;
343 }
344
345 /* Read the data of the copy and check the CRC */
346
347 len = ubi32_to_cpu(vid_hdr->data_size);
348 buf = kmalloc(len, GFP_KERNEL);
349 if (!buf) {
350 err = -ENOMEM;
351 goto out_free_vidh;
352 }
353
354 err = ubi_io_read_data(ubi, buf, pnum, 0, len);
355 if (err && err != UBI_IO_BITFLIPS)
356 goto out_free_buf;
357
358 data_crc = ubi32_to_cpu(vid_hdr->data_crc);
359 crc = crc32(UBI_CRC32_INIT, buf, len);
360 if (crc != data_crc) {
361 dbg_bld("PEB %d CRC error: calculated %#08x, must be %#08x",
362 pnum, crc, data_crc);
363 corrupted = 1;
364 bitflips = 0;
365 second_is_newer = !second_is_newer;
366 } else {
367 dbg_bld("PEB %d CRC is OK", pnum);
368 bitflips = !!err;
369 }
370
371 kfree(buf);
372 ubi_free_vid_hdr(ubi, vidh);
373
374 if (second_is_newer)
375 dbg_bld("second PEB %d is newer, copy_flag is set", pnum);
376 else
377 dbg_bld("first PEB %d is newer, copy_flag is set", pnum);
378
379 return second_is_newer | (bitflips << 1) | (corrupted << 2);
380
381out_free_buf:
382 kfree(buf);
383out_free_vidh:
384 ubi_free_vid_hdr(ubi, vidh);
385 ubi_assert(err < 0);
386 return err;
387}
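A caller decodes the bit-encoded result roughly as follows; the local variable names in this sketch are invented, and ubi_scan_add_used() below does this for real:

	/* Sketch: acting on the compare_lebs() result bits */
	cmp_res = compare_lebs(ubi, seb, pnum, vid_hdr);
	if (cmp_res < 0)
		return cmp_res;			/* hard failure */
	if (cmp_res & 1)
		newer_pnum = pnum;		/* bit 0: the second copy won */
	else
		newer_pnum = seb->pnum;
	scrub = !!(cmp_res & 2);		/* bit 1: newer copy had bit-flips */
	older_is_corrupted = !!(cmp_res & 4);	/* bit 2: older copy failed CRC */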
388
389/**
390 * ubi_scan_add_used - add information about a physical eraseblock to the
391 * scanning information.
392 * @ubi: UBI device description object
393 * @si: scanning information
394 * @pnum: the physical eraseblock number
395 * @ec: erase counter
396 * @vid_hdr: the volume identifier header
397 * @bitflips: if bit-flips were detected when this physical eraseblock was read
398 *
399 * This function returns zero in case of success and a negative error code in
400 * case of failure.
401 */
402int ubi_scan_add_used(const struct ubi_device *ubi, struct ubi_scan_info *si,
403 int pnum, int ec, const struct ubi_vid_hdr *vid_hdr,
404 int bitflips)
405{
406 int err, vol_id, lnum;
407 uint32_t leb_ver;
408 unsigned long long sqnum;
409 struct ubi_scan_volume *sv;
410 struct ubi_scan_leb *seb;
411 struct rb_node **p, *parent = NULL;
412
413 vol_id = ubi32_to_cpu(vid_hdr->vol_id);
414 lnum = ubi32_to_cpu(vid_hdr->lnum);
415 sqnum = ubi64_to_cpu(vid_hdr->sqnum);
416 leb_ver = ubi32_to_cpu(vid_hdr->leb_ver);
417
418 dbg_bld("PEB %d, LEB %d:%d, EC %d, sqnum %llu, ver %u, bitflips %d",
419 pnum, vol_id, lnum, ec, sqnum, leb_ver, bitflips);
420
421 sv = add_volume(si, vol_id, pnum, vid_hdr);
422	if (IS_ERR(sv))
423 return PTR_ERR(sv);
424
425 /*
426	 * Walk the RB-tree of logical eraseblocks of volume @vol_id to see
427	 * whether this is the first instance of this logical eraseblock.
428 */
429 p = &sv->root.rb_node;
430 while (*p) {
431 int cmp_res;
432
433 parent = *p;
434 seb = rb_entry(parent, struct ubi_scan_leb, u.rb);
435 if (lnum != seb->lnum) {
436 if (lnum < seb->lnum)
437 p = &(*p)->rb_left;
438 else
439 p = &(*p)->rb_right;
440 continue;
441 }
442
443 /*
444 * There is already a physical eraseblock describing the same
445 * logical eraseblock present.
446 */
447
448 dbg_bld("this LEB already exists: PEB %d, sqnum %llu, "
449 "LEB ver %u, EC %d", seb->pnum, seb->sqnum,
450 seb->leb_ver, seb->ec);
451
452 /*
453 * Make sure that the logical eraseblocks have different
454 * versions. Otherwise the image is bad.
455 */
456 if (seb->leb_ver == leb_ver && leb_ver != 0) {
457 ubi_err("two LEBs with same version %u", leb_ver);
458 ubi_dbg_dump_seb(seb, 0);
459 ubi_dbg_dump_vid_hdr(vid_hdr);
460 return -EINVAL;
461 }
462
463 /*
464 * Make sure that the logical eraseblocks have different
465 * sequence numbers. Otherwise the image is bad.
466 *
467 * FIXME: remove 'sqnum != 0' check when leb_ver is removed.
468 */
469 if (seb->sqnum == sqnum && sqnum != 0) {
470 ubi_err("two LEBs with same sequence number %llu",
471 sqnum);
472 ubi_dbg_dump_seb(seb, 0);
473 ubi_dbg_dump_vid_hdr(vid_hdr);
474 return -EINVAL;
475 }
476
477 /*
478 * Now we have to drop the older one and preserve the newer
479 * one.
480 */
481 cmp_res = compare_lebs(ubi, seb, pnum, vid_hdr);
482 if (cmp_res < 0)
483 return cmp_res;
484
485 if (cmp_res & 1) {
486 /*
487			 * This logical eraseblock is newer than the one
488 * found earlier.
489 */
490 err = validate_vid_hdr(vid_hdr, sv, pnum);
491 if (err)
492 return err;
493
494 if (cmp_res & 4)
495 err = ubi_scan_add_to_list(si, seb->pnum,
496 seb->ec, &si->corr);
497 else
498 err = ubi_scan_add_to_list(si, seb->pnum,
499 seb->ec, &si->erase);
500 if (err)
501 return err;
502
503 seb->ec = ec;
504 seb->pnum = pnum;
505 seb->scrub = ((cmp_res & 2) || bitflips);
506 seb->sqnum = sqnum;
507 seb->leb_ver = leb_ver;
508
509 if (sv->highest_lnum == lnum)
510 sv->last_data_size =
511 ubi32_to_cpu(vid_hdr->data_size);
512
513 return 0;
514 } else {
515 /*
516			 * This logical eraseblock is older than the one found
517 * previously.
518 */
519 if (cmp_res & 4)
520 return ubi_scan_add_to_list(si, pnum, ec,
521 &si->corr);
522 else
523 return ubi_scan_add_to_list(si, pnum, ec,
524 &si->erase);
525 }
526 }
527
528 /*
529 * We've met this logical eraseblock for the first time, add it to the
530 * scanning information.
531 */
532
533 err = validate_vid_hdr(vid_hdr, sv, pnum);
534 if (err)
535 return err;
536
537 seb = kmalloc(sizeof(struct ubi_scan_leb), GFP_KERNEL);
538 if (!seb)
539 return -ENOMEM;
540
541 seb->ec = ec;
542 seb->pnum = pnum;
543 seb->lnum = lnum;
544 seb->sqnum = sqnum;
545 seb->scrub = bitflips;
546 seb->leb_ver = leb_ver;
547
548 if (sv->highest_lnum <= lnum) {
549 sv->highest_lnum = lnum;
550 sv->last_data_size = ubi32_to_cpu(vid_hdr->data_size);
551 }
552
553 if (si->max_sqnum < sqnum)
554 si->max_sqnum = sqnum;
555
556 sv->leb_count += 1;
557 rb_link_node(&seb->u.rb, parent, p);
558 rb_insert_color(&seb->u.rb, &sv->root);
559 return 0;
560}
561
562/**
563 * ubi_scan_find_sv - find information about a particular volume in the
564 * scanning information.
565 * @si: scanning information
566 * @vol_id: the requested volume ID
567 *
568 * This function returns a pointer to the volume description or %NULL if there
569 * are no data about this volume in the scanning information.
570 */
571struct ubi_scan_volume *ubi_scan_find_sv(const struct ubi_scan_info *si,
572 int vol_id)
573{
574 struct ubi_scan_volume *sv;
575 struct rb_node *p = si->volumes.rb_node;
576
577 while (p) {
578 sv = rb_entry(p, struct ubi_scan_volume, rb);
579
580 if (vol_id == sv->vol_id)
581 return sv;
582
583 if (vol_id > sv->vol_id)
584 p = p->rb_left;
585 else
586 p = p->rb_right;
587 }
588
589 return NULL;
590}
591
592/**
593 * ubi_scan_find_seb - find information about a particular logical
594 * eraseblock in the volume scanning information.
595 * @sv: a pointer to the volume scanning information
596 * @lnum: the requested logical eraseblock
597 *
598 * This function returns a pointer to the scanning logical eraseblock or %NULL
599 * if there are no data about it in the scanning volume information.
600 */
601struct ubi_scan_leb *ubi_scan_find_seb(const struct ubi_scan_volume *sv,
602 int lnum)
603{
604 struct ubi_scan_leb *seb;
605 struct rb_node *p = sv->root.rb_node;
606
607 while (p) {
608 seb = rb_entry(p, struct ubi_scan_leb, u.rb);
609
610 if (lnum == seb->lnum)
611 return seb;
612
613		if (lnum < seb->lnum)
614 p = p->rb_left;
615 else
616 p = p->rb_right;
617 }
618
619 return NULL;
620}
621
622/**
623 * ubi_scan_rm_volume - delete scanning information about a volume.
624 * @si: scanning information
625 * @sv: the volume scanning information to delete
626 */
627void ubi_scan_rm_volume(struct ubi_scan_info *si, struct ubi_scan_volume *sv)
628{
629 struct rb_node *rb;
630 struct ubi_scan_leb *seb;
631
632 dbg_bld("remove scanning information about volume %d", sv->vol_id);
633
634 while ((rb = rb_first(&sv->root))) {
635 seb = rb_entry(rb, struct ubi_scan_leb, u.rb);
636 rb_erase(&seb->u.rb, &sv->root);
637 list_add_tail(&seb->u.list, &si->erase);
638 }
639
640 rb_erase(&sv->rb, &si->volumes);
641 kfree(sv);
642 si->vols_found -= 1;
643}
644
645/**
646 * ubi_scan_erase_peb - erase a physical eraseblock.
647 * @ubi: UBI device description object
648 * @si: scanning information
649 * @pnum: physical eraseblock number to erase;
650 * @ec: erase counter value to write (%UBI_SCAN_UNKNOWN_EC if it is unknown)
651 *
652 * This function erases physical eraseblock 'pnum', and writes the erase
653 * counter header to it. This function should only be used on UBI device
654 * initialization stages, when the EBA unit has not been initialized yet. This
655 * function returns zero in case of success and a negative error code in case
656 * of failure.
657 */
658int ubi_scan_erase_peb(const struct ubi_device *ubi,
659 const struct ubi_scan_info *si, int pnum, int ec)
660{
661 int err;
662 struct ubi_ec_hdr *ec_hdr;
663
664	if ((long long)ec >= UBI_MAX_ERASECOUNTER) {
665		/*
666		 * Erase counter overflow. Upgrade UBI and use 64-bit
667		 * erase counters internally.
668		 */
669		ubi_err("erase counter overflow at PEB %d, EC %d", pnum, ec);
670		return -EINVAL;
671	}
672
673	ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL);
674	if (!ec_hdr)
675		return -ENOMEM;
676
677	ec_hdr->ec = cpu_to_ubi64(ec);
678
679 err = ubi_io_sync_erase(ubi, pnum, 0);
680 if (err < 0)
681 goto out_free;
682
683 err = ubi_io_write_ec_hdr(ubi, pnum, ec_hdr);
684
685out_free:
686 kfree(ec_hdr);
687 return err;
688}
689
690/**
691 * ubi_scan_get_free_peb - get a free physical eraseblock.
692 * @ubi: UBI device description object
693 * @si: scanning information
694 *
695 * This function returns a free physical eraseblock. It is supposed to be
696 * called on the UBI initialization stages when the wear-leveling unit is not
697 * initialized yet. This function picks a physical eraseblock from one of the
698 * lists, writes the EC header if it is needed, and removes it from the list.
699 *
700 * This function returns scanning physical eraseblock information in case of
701 * success and an error code in case of failure.
702 */
703struct ubi_scan_leb *ubi_scan_get_free_peb(const struct ubi_device *ubi,
704 struct ubi_scan_info *si)
705{
706 int err = 0, i;
707 struct ubi_scan_leb *seb;
708
709 if (!list_empty(&si->free)) {
710 seb = list_entry(si->free.next, struct ubi_scan_leb, u.list);
711 list_del(&seb->u.list);
712 dbg_bld("return free PEB %d, EC %d", seb->pnum, seb->ec);
713 return seb;
714 }
715
716 for (i = 0; i < 2; i++) {
717 struct list_head *head;
718 struct ubi_scan_leb *tmp_seb;
719
720 if (i == 0)
721 head = &si->erase;
722 else
723 head = &si->corr;
724
725 /*
726 * We try to erase the first physical eraseblock from the @head
727 * list and pick it if we succeed, or try to erase the
728		 * next one if not. And so forth. We do not worry about bad
729		 * eraseblocks here - they will be handled later.
730 */
731 list_for_each_entry_safe(seb, tmp_seb, head, u.list) {
732 if (seb->ec == UBI_SCAN_UNKNOWN_EC)
733 seb->ec = si->mean_ec;
734
735 err = ubi_scan_erase_peb(ubi, si, seb->pnum, seb->ec+1);
736 if (err)
737 continue;
738
739 seb->ec += 1;
740 list_del(&seb->u.list);
741 dbg_bld("return PEB %d, EC %d", seb->pnum, seb->ec);
742 return seb;
743 }
744 }
745
746 ubi_err("no eraseblocks found");
747 return ERR_PTR(-ENOSPC);
748}
749
750/**
751 * process_eb - read UBI headers, check them and add corresponding data
752 * to the scanning information.
753 * @ubi: UBI device description object
754 * @si: scanning information
755 * @pnum: the physical eraseblock number
756 *
757 * This function returns zero if the physical eraseblock was successfully
758 * handled and a negative error code in case of failure.
759 */
760static int process_eb(struct ubi_device *ubi, struct ubi_scan_info *si, int pnum)
761{
762 long long ec;
763 int err, bitflips = 0, vol_id, ec_corr = 0;
764
765 dbg_bld("scan PEB %d", pnum);
766
767 /* Skip bad physical eraseblocks */
768 err = ubi_io_is_bad(ubi, pnum);
769 if (err < 0)
770 return err;
771 else if (err) {
772 /*
773		 * FIXME: it is actually the duty of the I/O unit to initialize
774 * this, but MTD does not provide enough information.
775 */
776 si->bad_peb_count += 1;
777 return 0;
778 }
779
780 err = ubi_io_read_ec_hdr(ubi, pnum, ech, 0);
781 if (err < 0)
782 return err;
783 else if (err == UBI_IO_BITFLIPS)
784 bitflips = 1;
785 else if (err == UBI_IO_PEB_EMPTY)
786 return ubi_scan_add_to_list(si, pnum, UBI_SCAN_UNKNOWN_EC,
787 &si->erase);
788 else if (err == UBI_IO_BAD_EC_HDR) {
789 /*
790 * We have to also look at the VID header, possibly it is not
791 * corrupted. Set %bitflips flag in order to make this PEB be
792 * moved and EC be re-created.
793 */
794 ec_corr = 1;
795 ec = UBI_SCAN_UNKNOWN_EC;
796 bitflips = 1;
797 }
798
799 si->is_empty = 0;
800
801 if (!ec_corr) {
802 /* Make sure UBI version is OK */
803 if (ech->version != UBI_VERSION) {
804 ubi_err("this UBI version is %d, image version is %d",
805 UBI_VERSION, (int)ech->version);
806 return -EINVAL;
807 }
808
809 ec = ubi64_to_cpu(ech->ec);
810 if (ec > UBI_MAX_ERASECOUNTER) {
811 /*
812 * Erase counter overflow. The EC headers have 64 bits
813 * reserved, but we anyway make use of only 31 bit
814 * values, as this seems to be enough for any existing
815 * flash. Upgrade UBI and use 64-bit erase counters
816 * internally.
817 */
818 ubi_err("erase counter overflow, max is %d",
819 UBI_MAX_ERASECOUNTER);
820 ubi_dbg_dump_ec_hdr(ech);
821 return -EINVAL;
822 }
823 }
824
825	/* OK, we are done with the EC header, let's look at the VID header */
826
827 err = ubi_io_read_vid_hdr(ubi, pnum, vidh, 0);
828 if (err < 0)
829 return err;
830 else if (err == UBI_IO_BITFLIPS)
831 bitflips = 1;
832 else if (err == UBI_IO_BAD_VID_HDR ||
833 (err == UBI_IO_PEB_FREE && ec_corr)) {
834 /* VID header is corrupted */
835 err = ubi_scan_add_to_list(si, pnum, ec, &si->corr);
836 if (err)
837 return err;
838 goto adjust_mean_ec;
839 } else if (err == UBI_IO_PEB_FREE) {
840 /* No VID header - the physical eraseblock is free */
841 err = ubi_scan_add_to_list(si, pnum, ec, &si->free);
842 if (err)
843 return err;
844 goto adjust_mean_ec;
845 }
846
847 vol_id = ubi32_to_cpu(vidh->vol_id);
848 if (vol_id > UBI_MAX_VOLUMES && vol_id != UBI_LAYOUT_VOL_ID) {
849 int lnum = ubi32_to_cpu(vidh->lnum);
850
851 /* Unsupported internal volume */
852 switch (vidh->compat) {
853 case UBI_COMPAT_DELETE:
854 ubi_msg("\"delete\" compatible internal volume %d:%d"
855 " found, remove it", vol_id, lnum);
856 err = ubi_scan_add_to_list(si, pnum, ec, &si->corr);
857 if (err)
858 return err;
859 break;
860
861 case UBI_COMPAT_RO:
862 ubi_msg("read-only compatible internal volume %d:%d"
863 " found, switch to read-only mode",
864 vol_id, lnum);
865 ubi->ro_mode = 1;
866 break;
867
868 case UBI_COMPAT_PRESERVE:
869 ubi_msg("\"preserve\" compatible internal volume %d:%d"
870 " found", vol_id, lnum);
871 err = ubi_scan_add_to_list(si, pnum, ec, &si->alien);
872 if (err)
873 return err;
874 si->alien_peb_count += 1;
875 return 0;
876
877 case UBI_COMPAT_REJECT:
878 ubi_err("incompatible internal volume %d:%d found",
879 vol_id, lnum);
880 return -EINVAL;
881 }
882 }
883
884 /* Both UBI headers seem to be fine */
885 err = ubi_scan_add_used(ubi, si, pnum, ec, vidh, bitflips);
886 if (err)
887 return err;
888
889adjust_mean_ec:
890 if (!ec_corr) {
891 if (si->ec_sum + ec < ec) {
892 commit_to_mean_value(si);
893 si->ec_sum = 0;
894 si->ec_count = 0;
895 } else {
896 si->ec_sum += ec;
897 si->ec_count += 1;
898 }
899
900 if (ec > si->max_ec)
901 si->max_ec = ec;
902 if (ec < si->min_ec)
903 si->min_ec = ec;
904 }
905
906 return 0;
907}
908
909/**
910 * ubi_scan - scan an MTD device.
911 * @ubi: UBI device description object
912 *
913 * This function does full scanning of an MTD device and returns complete
914 * information about it. In case of failure, an error pointer is returned.
915 */
916struct ubi_scan_info *ubi_scan(struct ubi_device *ubi)
917{
918 int err, pnum;
919 struct rb_node *rb1, *rb2;
920 struct ubi_scan_volume *sv;
921 struct ubi_scan_leb *seb;
922 struct ubi_scan_info *si;
923
924 si = kzalloc(sizeof(struct ubi_scan_info), GFP_KERNEL);
925 if (!si)
926 return ERR_PTR(-ENOMEM);
927
928 INIT_LIST_HEAD(&si->corr);
929 INIT_LIST_HEAD(&si->free);
930 INIT_LIST_HEAD(&si->erase);
931 INIT_LIST_HEAD(&si->alien);
932 si->volumes = RB_ROOT;
933 si->is_empty = 1;
934
935 err = -ENOMEM;
936 ech = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL);
937 if (!ech)
938 goto out_si;
939
940 vidh = ubi_zalloc_vid_hdr(ubi);
941 if (!vidh)
942 goto out_ech;
943
944 for (pnum = 0; pnum < ubi->peb_count; pnum++) {
945 cond_resched();
946
947 dbg_msg("process PEB %d", pnum);
948 err = process_eb(ubi, si, pnum);
949 if (err < 0)
950 goto out_vidh;
951 }
952
953 dbg_msg("scanning is finished");
954
955 /* Finish mean erase counter calculations */
956 if (si->ec_count)
957 commit_to_mean_value(si);
958
959 if (si->is_empty)
960 ubi_msg("empty MTD device detected");
961
962 /*
963 * In case of unknown erase counter we use the mean erase counter
964 * value.
965 */
966 ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) {
967 ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb)
968 if (seb->ec == UBI_SCAN_UNKNOWN_EC)
969 seb->ec = si->mean_ec;
970 }
971
972 list_for_each_entry(seb, &si->free, u.list) {
973 if (seb->ec == UBI_SCAN_UNKNOWN_EC)
974 seb->ec = si->mean_ec;
975 }
976
977 list_for_each_entry(seb, &si->corr, u.list)
978 if (seb->ec == UBI_SCAN_UNKNOWN_EC)
979 seb->ec = si->mean_ec;
980
981 list_for_each_entry(seb, &si->erase, u.list)
982 if (seb->ec == UBI_SCAN_UNKNOWN_EC)
983 seb->ec = si->mean_ec;
984
985 err = paranoid_check_si(ubi, si);
986 if (err) {
987 if (err > 0)
988 err = -EINVAL;
989 goto out_vidh;
990 }
991
992 ubi_free_vid_hdr(ubi, vidh);
993 kfree(ech);
994
995 return si;
996
997out_vidh:
998 ubi_free_vid_hdr(ubi, vidh);
999out_ech:
1000 kfree(ech);
1001out_si:
1002 ubi_scan_destroy_si(si);
1003 return ERR_PTR(err);
1004}
1005
1006/**
1007 * destroy_sv - free the scanning volume information
1008 * @sv: scanning volume information
1009 *
1010 * This function destroys the volume RB-tree (@sv->root) and the scanning
1011 * volume information.
1012 */
1013static void destroy_sv(struct ubi_scan_volume *sv)
1014{
1015 struct ubi_scan_leb *seb;
1016 struct rb_node *this = sv->root.rb_node;
1017
1018 while (this) {
1019 if (this->rb_left)
1020 this = this->rb_left;
1021 else if (this->rb_right)
1022 this = this->rb_right;
1023 else {
1024 seb = rb_entry(this, struct ubi_scan_leb, u.rb);
1025 this = rb_parent(this);
1026 if (this) {
1027 if (this->rb_left == &seb->u.rb)
1028 this->rb_left = NULL;
1029 else
1030 this->rb_right = NULL;
1031 }
1032
1033 kfree(seb);
1034 }
1035 }
1036 kfree(sv);
1037}
1038
1039/**
1040 * ubi_scan_destroy_si - destroy scanning information.
1041 * @si: scanning information
1042 */
1043void ubi_scan_destroy_si(struct ubi_scan_info *si)
1044{
1045 struct ubi_scan_leb *seb, *seb_tmp;
1046 struct ubi_scan_volume *sv;
1047 struct rb_node *rb;
1048
1049 list_for_each_entry_safe(seb, seb_tmp, &si->alien, u.list) {
1050 list_del(&seb->u.list);
1051 kfree(seb);
1052 }
1053 list_for_each_entry_safe(seb, seb_tmp, &si->erase, u.list) {
1054 list_del(&seb->u.list);
1055 kfree(seb);
1056 }
1057 list_for_each_entry_safe(seb, seb_tmp, &si->corr, u.list) {
1058 list_del(&seb->u.list);
1059 kfree(seb);
1060 }
1061 list_for_each_entry_safe(seb, seb_tmp, &si->free, u.list) {
1062 list_del(&seb->u.list);
1063 kfree(seb);
1064 }
1065
1066 /* Destroy the volume RB-tree */
1067 rb = si->volumes.rb_node;
1068 while (rb) {
1069 if (rb->rb_left)
1070 rb = rb->rb_left;
1071 else if (rb->rb_right)
1072 rb = rb->rb_right;
1073 else {
1074 sv = rb_entry(rb, struct ubi_scan_volume, rb);
1075
1076 rb = rb_parent(rb);
1077 if (rb) {
1078 if (rb->rb_left == &sv->rb)
1079 rb->rb_left = NULL;
1080 else
1081 rb->rb_right = NULL;
1082 }
1083
1084 destroy_sv(sv);
1085 }
1086 }
1087
1088 kfree(si);
1089}
1090
1091#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
1092
1093/**
1094 * paranoid_check_si - check if the scanning information is correct and
1095 * consistent.
1096 * @ubi: UBI device description object
1097 * @si: scanning information
1098 *
1099 * This function returns zero if the scanning information is all right, %1 if
1100 * not and a negative error code if an error occurred.
1101 */
1102static int paranoid_check_si(const struct ubi_device *ubi,
1103 struct ubi_scan_info *si)
1104{
1105 int pnum, err, vols_found = 0;
1106 struct rb_node *rb1, *rb2;
1107 struct ubi_scan_volume *sv;
1108 struct ubi_scan_leb *seb, *last_seb;
1109 uint8_t *buf;
1110
1111 /*
1112 * At first, check that scanning information is ok.
1113 */
1114 ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) {
1115 int leb_count = 0;
1116
1117 cond_resched();
1118
1119 vols_found += 1;
1120
1121 if (si->is_empty) {
1122 ubi_err("bad is_empty flag");
1123 goto bad_sv;
1124 }
1125
1126 if (sv->vol_id < 0 || sv->highest_lnum < 0 ||
1127 sv->leb_count < 0 || sv->vol_type < 0 || sv->used_ebs < 0 ||
1128 sv->data_pad < 0 || sv->last_data_size < 0) {
1129 ubi_err("negative values");
1130 goto bad_sv;
1131 }
1132
1133 if (sv->vol_id >= UBI_MAX_VOLUMES &&
1134 sv->vol_id < UBI_INTERNAL_VOL_START) {
1135 ubi_err("bad vol_id");
1136 goto bad_sv;
1137 }
1138
1139 if (sv->vol_id > si->highest_vol_id) {
1140 ubi_err("highest_vol_id is %d, but vol_id %d is there",
1141 si->highest_vol_id, sv->vol_id);
1142 goto out;
1143 }
1144
1145 if (sv->vol_type != UBI_DYNAMIC_VOLUME &&
1146 sv->vol_type != UBI_STATIC_VOLUME) {
1147 ubi_err("bad vol_type");
1148 goto bad_sv;
1149 }
1150
1151 if (sv->data_pad > ubi->leb_size / 2) {
1152 ubi_err("bad data_pad");
1153 goto bad_sv;
1154 }
1155
1156 last_seb = NULL;
1157 ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb) {
1158 cond_resched();
1159
1160 last_seb = seb;
1161 leb_count += 1;
1162
1163 if (seb->pnum < 0 || seb->ec < 0) {
1164 ubi_err("negative values");
1165 goto bad_seb;
1166 }
1167
1168 if (seb->ec < si->min_ec) {
1169 ubi_err("bad si->min_ec (%d), %d found",
1170 si->min_ec, seb->ec);
1171 goto bad_seb;
1172 }
1173
1174 if (seb->ec > si->max_ec) {
1175 ubi_err("bad si->max_ec (%d), %d found",
1176 si->max_ec, seb->ec);
1177 goto bad_seb;
1178 }
1179
1180 if (seb->pnum >= ubi->peb_count) {
1181 ubi_err("too high PEB number %d, total PEBs %d",
1182 seb->pnum, ubi->peb_count);
1183 goto bad_seb;
1184 }
1185
1186 if (sv->vol_type == UBI_STATIC_VOLUME) {
1187 if (seb->lnum >= sv->used_ebs) {
1188 ubi_err("bad lnum or used_ebs");
1189 goto bad_seb;
1190 }
1191 } else {
1192 if (sv->used_ebs != 0) {
1193 ubi_err("non-zero used_ebs");
1194 goto bad_seb;
1195 }
1196 }
1197
1198 if (seb->lnum > sv->highest_lnum) {
1199 ubi_err("incorrect highest_lnum or lnum");
1200 goto bad_seb;
1201 }
1202 }
1203
1204 if (sv->leb_count != leb_count) {
1205 ubi_err("bad leb_count, %d objects in the tree",
1206 leb_count);
1207 goto bad_sv;
1208 }
1209
1210 if (!last_seb)
1211 continue;
1212
1213 seb = last_seb;
1214
1215 if (seb->lnum != sv->highest_lnum) {
1216 ubi_err("bad highest_lnum");
1217 goto bad_seb;
1218 }
1219 }
1220
1221 if (vols_found != si->vols_found) {
1222 ubi_err("bad si->vols_found %d, should be %d",
1223 si->vols_found, vols_found);
1224 goto out;
1225 }
1226
1227 /* Check that scanning information is correct */
1228 ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) {
1229 last_seb = NULL;
1230 ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb) {
1231 int vol_type;
1232
1233 cond_resched();
1234
1235 last_seb = seb;
1236
1237 err = ubi_io_read_vid_hdr(ubi, seb->pnum, vidh, 1);
1238 if (err && err != UBI_IO_BITFLIPS) {
1239 ubi_err("VID header is not OK (%d)", err);
1240 if (err > 0)
1241 err = -EIO;
1242 return err;
1243 }
1244
1245 vol_type = vidh->vol_type == UBI_VID_DYNAMIC ?
1246 UBI_DYNAMIC_VOLUME : UBI_STATIC_VOLUME;
1247 if (sv->vol_type != vol_type) {
1248 ubi_err("bad vol_type");
1249 goto bad_vid_hdr;
1250 }
1251
1252 if (seb->sqnum != ubi64_to_cpu(vidh->sqnum)) {
1253 ubi_err("bad sqnum %llu", seb->sqnum);
1254 goto bad_vid_hdr;
1255 }
1256
1257 if (sv->vol_id != ubi32_to_cpu(vidh->vol_id)) {
1258 ubi_err("bad vol_id %d", sv->vol_id);
1259 goto bad_vid_hdr;
1260 }
1261
1262 if (sv->compat != vidh->compat) {
1263 ubi_err("bad compat %d", vidh->compat);
1264 goto bad_vid_hdr;
1265 }
1266
1267 if (seb->lnum != ubi32_to_cpu(vidh->lnum)) {
1268 ubi_err("bad lnum %d", seb->lnum);
1269 goto bad_vid_hdr;
1270 }
1271
1272 if (sv->used_ebs != ubi32_to_cpu(vidh->used_ebs)) {
1273 ubi_err("bad used_ebs %d", sv->used_ebs);
1274 goto bad_vid_hdr;
1275 }
1276
1277 if (sv->data_pad != ubi32_to_cpu(vidh->data_pad)) {
1278 ubi_err("bad data_pad %d", sv->data_pad);
1279 goto bad_vid_hdr;
1280 }
1281
1282 if (seb->leb_ver != ubi32_to_cpu(vidh->leb_ver)) {
1283 ubi_err("bad leb_ver %u", seb->leb_ver);
1284 goto bad_vid_hdr;
1285 }
1286 }
1287
1288 if (!last_seb)
1289 continue;
1290
1291 if (sv->highest_lnum != ubi32_to_cpu(vidh->lnum)) {
1292 ubi_err("bad highest_lnum %d", sv->highest_lnum);
1293 goto bad_vid_hdr;
1294 }
1295
1296 if (sv->last_data_size != ubi32_to_cpu(vidh->data_size)) {
1297 ubi_err("bad last_data_size %d", sv->last_data_size);
1298 goto bad_vid_hdr;
1299 }
1300 }
1301
1302 /*
1303 * Make sure that all the physical eraseblocks are in one of the lists
1304 * or trees.
1305 */
1306 buf = kmalloc(ubi->peb_count, GFP_KERNEL);
1307 if (!buf)
1308 return -ENOMEM;
1309
1310 memset(buf, 1, ubi->peb_count);
1311 for (pnum = 0; pnum < ubi->peb_count; pnum++) {
1312 err = ubi_io_is_bad(ubi, pnum);
1313		if (err < 0) {
1314			kfree(buf); return err;
1315		} else if (err)
1316			buf[pnum] = 0;
1317 }
1318
1319 ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb)
1320 ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb)
1321 buf[seb->pnum] = 0;
1322
1323 list_for_each_entry(seb, &si->free, u.list)
1324 buf[seb->pnum] = 0;
1325
1326 list_for_each_entry(seb, &si->corr, u.list)
1327 buf[seb->pnum] = 0;
1328
1329 list_for_each_entry(seb, &si->erase, u.list)
1330 buf[seb->pnum] = 0;
1331
1332 list_for_each_entry(seb, &si->alien, u.list)
1333 buf[seb->pnum] = 0;
1334
1335 err = 0;
1336 for (pnum = 0; pnum < ubi->peb_count; pnum++)
1337 if (buf[pnum]) {
1338			ubi_err("PEB %d is not referenced", pnum);
1339 err = 1;
1340 }
1341
1342 kfree(buf);
1343 if (err)
1344 goto out;
1345 return 0;
1346
1347bad_seb:
1348 ubi_err("bad scanning information about LEB %d", seb->lnum);
1349 ubi_dbg_dump_seb(seb, 0);
1350 ubi_dbg_dump_sv(sv);
1351 goto out;
1352
1353bad_sv:
1354 ubi_err("bad scanning information about volume %d", sv->vol_id);
1355 ubi_dbg_dump_sv(sv);
1356 goto out;
1357
1358bad_vid_hdr:
1359 ubi_err("bad scanning information about volume %d", sv->vol_id);
1360 ubi_dbg_dump_sv(sv);
1361 ubi_dbg_dump_vid_hdr(vidh);
1362
1363out:
1364 ubi_dbg_dump_stack();
1365 return 1;
1366}
1367
1368#endif /* CONFIG_MTD_UBI_DEBUG_PARANOID */
diff --git a/drivers/mtd/ubi/scan.h b/drivers/mtd/ubi/scan.h
new file mode 100644
index 000000000000..3949f6192c76
--- /dev/null
+++ b/drivers/mtd/ubi/scan.h
@@ -0,0 +1,167 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21#ifndef __UBI_SCAN_H__
22#define __UBI_SCAN_H__
23
24/* The erase counter value for this physical eraseblock is unknown */
25#define UBI_SCAN_UNKNOWN_EC (-1)
26
27/**
28 * struct ubi_scan_leb - scanning information about a physical eraseblock.
29 * @ec: erase counter (%UBI_SCAN_UNKNOWN_EC if it is unknown)
30 * @pnum: physical eraseblock number
31 * @lnum: logical eraseblock number
32 * @scrub: if this physical eraseblock needs scrubbing
33 * @sqnum: sequence number
 34 * @u: union of RB-tree and list links
35 * @u.rb: link in the per-volume RB-tree of &struct ubi_scan_leb objects
36 * @u.list: link in one of the eraseblock lists
37 * @leb_ver: logical eraseblock version (obsolete)
38 *
39 * One object of this type is allocated for each physical eraseblock during
40 * scanning.
41 */
42struct ubi_scan_leb {
43 int ec;
44 int pnum;
45 int lnum;
46 int scrub;
47 unsigned long long sqnum;
48 union {
49 struct rb_node rb;
50 struct list_head list;
51 } u;
52 uint32_t leb_ver;
53};
54
55/**
56 * struct ubi_scan_volume - scanning information about a volume.
57 * @vol_id: volume ID
58 * @highest_lnum: highest logical eraseblock number in this volume
59 * @leb_count: number of logical eraseblocks in this volume
60 * @vol_type: volume type
61 * @used_ebs: number of used logical eraseblocks in this volume (only for
62 * static volumes)
63 * @last_data_size: amount of data in the last logical eraseblock of this
 64 * volume (always equal to the usable logical eraseblock size in case of
65 * dynamic volumes)
66 * @data_pad: how many bytes at the end of logical eraseblocks of this volume
67 * are not used (due to volume alignment)
68 * @compat: compatibility flags of this volume
69 * @rb: link in the volume RB-tree
 70 * @root: root of the RB-tree containing all the eraseblocks belonging to this
71 * volume (&struct ubi_scan_leb objects)
72 *
73 * One object of this type is allocated for each volume during scanning.
74 */
75struct ubi_scan_volume {
76 int vol_id;
77 int highest_lnum;
78 int leb_count;
79 int vol_type;
80 int used_ebs;
81 int last_data_size;
82 int data_pad;
83 int compat;
84 struct rb_node rb;
85 struct rb_root root;
86};
87
88/**
89 * struct ubi_scan_info - UBI scanning information.
90 * @volumes: root of the volume RB-tree
91 * @corr: list of corrupted physical eraseblocks
92 * @free: list of free physical eraseblocks
93 * @erase: list of physical eraseblocks which have to be erased
94 * @alien: list of physical eraseblocks which should not be used by UBI (e.g.,
 95 * those belonging to "preserve"-compatible internal volumes)
 96 * @bad_peb_count: count of bad physical eraseblocks
97 * @vols_found: number of volumes found during scanning
98 * @highest_vol_id: highest volume ID
99 * @alien_peb_count: count of physical eraseblocks in the @alien list
100 * @is_empty: flag indicating whether the MTD device is empty or not
101 * @min_ec: lowest erase counter value
102 * @max_ec: highest erase counter value
103 * @max_sqnum: highest sequence number value
104 * @mean_ec: mean erase counter value
105 * @ec_sum: a temporary variable used when calculating @mean_ec
106 * @ec_count: a temporary variable used when calculating @mean_ec
107 *
108 * This data structure contains the result of scanning and may be used by
109 * other UBI units to build the final UBI data structures, perform further
110 * error recovery, and so on.
111 */
112struct ubi_scan_info {
113 struct rb_root volumes;
114 struct list_head corr;
115 struct list_head free;
116 struct list_head erase;
117 struct list_head alien;
118 int bad_peb_count;
119 int vols_found;
120 int highest_vol_id;
121 int alien_peb_count;
122 int is_empty;
123 int min_ec;
124 int max_ec;
125 unsigned long long max_sqnum;
126 int mean_ec;
127 int ec_sum;
128 int ec_count;
129};
130
131struct ubi_device;
132struct ubi_vid_hdr;
133
134/**
135 * ubi_scan_move_to_list - move a physical eraseblock from the volume tree to a
136 * list.
137 *
138 * @sv: volume scanning information
139 * @seb: scanning eraseblock information
140 * @list: the list to move to
141 */
142static inline void ubi_scan_move_to_list(struct ubi_scan_volume *sv,
143 struct ubi_scan_leb *seb,
144 struct list_head *list)
145{
146 rb_erase(&seb->u.rb, &sv->root);
147 list_add_tail(&seb->u.list, list);
148}
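For illustration, the lookup helpers declared below can be combined like this sketch (the volume and LEB numbers are made-up example values, and @si is assumed to be a struct ubi_scan_info pointer):

	/* Sketch: find where LEB 5 of volume 3 lives according to scanning */
	struct ubi_scan_volume *sv = ubi_scan_find_sv(si, 3);

	if (sv) {
		struct ubi_scan_leb *seb = ubi_scan_find_seb(sv, 5);

		if (seb)
			dbg_bld("LEB 3:5 is on PEB %d, EC %d", seb->pnum, seb->ec);
	}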
149
150int ubi_scan_add_to_list(struct ubi_scan_info *si, int pnum, int ec,
151 struct list_head *list);
152int ubi_scan_add_used(const struct ubi_device *ubi, struct ubi_scan_info *si,
153 int pnum, int ec, const struct ubi_vid_hdr *vid_hdr,
154 int bitflips);
155struct ubi_scan_volume *ubi_scan_find_sv(const struct ubi_scan_info *si,
156 int vol_id);
157struct ubi_scan_leb *ubi_scan_find_seb(const struct ubi_scan_volume *sv,
158 int lnum);
159void ubi_scan_rm_volume(struct ubi_scan_info *si, struct ubi_scan_volume *sv);
160struct ubi_scan_leb *ubi_scan_get_free_peb(const struct ubi_device *ubi,
161 struct ubi_scan_info *si);
162int ubi_scan_erase_peb(const struct ubi_device *ubi,
163 const struct ubi_scan_info *si, int pnum, int ec);
164struct ubi_scan_info *ubi_scan(struct ubi_device *ubi);
165void ubi_scan_destroy_si(struct ubi_scan_info *si);
166
167#endif /* !__UBI_SCAN_H__ */
diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h
new file mode 100644
index 000000000000..feb647f108f0
--- /dev/null
+++ b/drivers/mtd/ubi/ubi.h
@@ -0,0 +1,535 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 * Copyright (c) Nokia Corporation, 2006, 2007
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * Author: Artem Bityutskiy (Битюцкий Артём)
20 */
21
22#ifndef __UBI_UBI_H__
23#define __UBI_UBI_H__
24
25#include <linux/init.h>
26#include <linux/types.h>
27#include <linux/list.h>
28#include <linux/rbtree.h>
29#include <linux/sched.h>
30#include <linux/wait.h>
31#include <linux/mutex.h>
32#include <linux/rwsem.h>
33#include <linux/spinlock.h>
34#include <linux/fs.h>
35#include <linux/cdev.h>
36#include <linux/device.h>
37#include <linux/string.h>
38#include <linux/mtd/mtd.h>
39
40#include <mtd/ubi-header.h>
41#include <linux/mtd/ubi.h>
42
43#include "scan.h"
44#include "debug.h"
45
46/* Maximum number of supported UBI devices */
47#define UBI_MAX_DEVICES 32
48
49/* UBI name used for character devices, sysfs, etc */
50#define UBI_NAME_STR "ubi"
51
52/* Normal UBI messages */
53#define ubi_msg(fmt, ...) printk(KERN_NOTICE "UBI: " fmt "\n", ##__VA_ARGS__)
54/* UBI warning messages */
55#define ubi_warn(fmt, ...) printk(KERN_WARNING "UBI warning: %s: " fmt "\n", \
56 __FUNCTION__, ##__VA_ARGS__)
57/* UBI error messages */
58#define ubi_err(fmt, ...) printk(KERN_ERR "UBI error: %s: " fmt "\n", \
59 __FUNCTION__, ##__VA_ARGS__)
60
 61/* Lowest number of PEBs reserved for bad PEB handling */
 62#define MIN_RESERVED_PEBS 2
63
64/* Background thread name pattern */
65#define UBI_BGT_NAME_PATTERN "ubi_bgt%dd"
66
 67/* This marker in the EBA table means that the LEB is un-mapped */
68#define UBI_LEB_UNMAPPED -1
69
70/*
71 * In case of errors, UBI tries to repeat the operation several times before
 72 * returning an error. The constant below defines how many times UBI retries.
73 */
74#define UBI_IO_RETRIES 3
75
76/*
77 * Error codes returned by the I/O unit.
78 *
79 * UBI_IO_PEB_EMPTY: the physical eraseblock is empty, i.e. it contains only
80 * 0xFF bytes
81 * UBI_IO_PEB_FREE: the physical eraseblock is free, i.e. it contains only a
82 * valid erase counter header, and the rest are %0xFF bytes
83 * UBI_IO_BAD_EC_HDR: the erase counter header is corrupted (bad magic or CRC)
84 * UBI_IO_BAD_VID_HDR: the volume identifier header is corrupted (bad magic or
85 * CRC)
86 * UBI_IO_BITFLIPS: bit-flips were detected and corrected
87 */
88enum {
89 UBI_IO_PEB_EMPTY = 1,
90 UBI_IO_PEB_FREE,
91 UBI_IO_BAD_EC_HDR,
92 UBI_IO_BAD_VID_HDR,
93 UBI_IO_BITFLIPS
94};
95
96extern int ubi_devices_cnt;
97extern struct ubi_device *ubi_devices[];
98
99struct ubi_volume_desc;
100
101/**
102 * struct ubi_volume - UBI volume description data structure.
103 * @dev: device object to make use of the Linux device model
104 * @cdev: character device object to create character device
105 * @ubi: reference to the UBI device description object
106 * @vol_id: volume ID
107 * @readers: number of users holding this volume in read-only mode
108 * @writers: number of users holding this volume in read-write mode
109 * @exclusive: whether somebody holds this volume in exclusive mode
110 * @removed: if the volume was removed
111 * @checked: if this static volume was checked
112 *
113 * @reserved_pebs: how many physical eraseblocks are reserved for this volume
114 * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME)
115 * @usable_leb_size: logical eraseblock size without padding
116 * @used_ebs: how many logical eraseblocks in this volume contain data
117 * @last_eb_bytes: how many bytes are stored in the last logical eraseblock
118 * @used_bytes: how many bytes of data this volume contains
119 * @upd_marker: non-zero if the update marker is set for this volume
120 * @corrupted: non-zero if the volume is corrupted (static volumes only)
121 * @alignment: volume alignment
122 * @data_pad: how many bytes are not used at the end of physical eraseblocks to
123 * satisfy the requested alignment
124 * @name_len: volume name length
125 * @name: volume name
126 *
127 * @updating: whether the volume is being updated
128 * @upd_ebs: how many eraseblocks are expected to be updated
129 * @upd_bytes: how many bytes are expected to be received
130 * @upd_received: how many update bytes were already received
131 * @upd_buf: update buffer which is used to collect update data
132 *
133 * @eba_tbl: EBA table of this volume (LEB->PEB mapping)
134 *
135 * @gluebi_desc: gluebi UBI volume descriptor
136 * @gluebi_refcount: reference count of the gluebi MTD device
137 * @gluebi_mtd: MTD device description object of the gluebi MTD device
138 *
139 * The @corrupted field indicates that the volume's contents are corrupted.
140 * Since UBI protects only static volumes, this field is not relevant to
141 * dynamic volumes - it is the user's responsibility to ensure their data
142 * integrity.
143 *
144 * The @upd_marker flag indicates that this volume is either being updated at
145 * the moment or is damaged because of an unclean reboot.
146 */
147struct ubi_volume {
148 struct device dev;
149 struct cdev cdev;
150 struct ubi_device *ubi;
151 int vol_id;
152 int readers;
153 int writers;
154 int exclusive;
155 int removed;
156 int checked;
157
158 int reserved_pebs;
159 int vol_type;
160 int usable_leb_size;
161 int used_ebs;
162 int last_eb_bytes;
163 long long used_bytes;
164 int upd_marker;
165 int corrupted;
166 int alignment;
167 int data_pad;
168 int name_len;
169 char name[UBI_VOL_NAME_MAX+1];
170
171 int updating;
172 int upd_ebs;
173 long long upd_bytes;
174 long long upd_received;
175 void *upd_buf;
176
177 int *eba_tbl;
178
179#ifdef CONFIG_MTD_UBI_GLUEBI
180 /* Gluebi-related stuff may be compiled out */
181 struct ubi_volume_desc *gluebi_desc;
182 int gluebi_refcount;
183 struct mtd_info gluebi_mtd;
184#endif
185};
186
187/**
188 * struct ubi_volume_desc - descriptor of the UBI volume returned when it is
189 * opened.
190 * @vol: reference to the corresponding volume description object
191 * @mode: open mode (%UBI_READONLY, %UBI_READWRITE, or %UBI_EXCLUSIVE)
192 */
193struct ubi_volume_desc {
194 struct ubi_volume *vol;
195 int mode;
196};
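197
198/*
199 * Typical kernel-side usage (a sketch based on the kapi.c interface; error
200 * handling abbreviated):
201 *
202 *	struct ubi_volume_desc *desc;
203 *
204 *	desc = ubi_open_volume(ubi_num, vol_id, UBI_READWRITE);
205 *	if (IS_ERR(desc))
206 *		return PTR_ERR(desc);
207 *	err = ubi_leb_read(desc, lnum, buf, 0, len, 0);
208 *	ubi_close_volume(desc);
209 */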
197
198struct ubi_wl_entry;
199
200/**
201 * struct ubi_device - UBI device description structure
202 * @dev: class device object to make use of the Linux device model
203 * @cdev: character device object to create character device
204 * @ubi_num: UBI device number
205 * @ubi_name: UBI device name
206 * @major: character device major number
207 * @vol_count: number of volumes in this UBI device
208 * @volumes: volumes of this UBI device
209 * @volumes_lock: protects @volumes, @rsvd_pebs, @avail_pebs, @beb_rsvd_pebs,
210 * @beb_rsvd_level, @bad_peb_count, @good_peb_count, @vol_count, @vol->readers,
211 * @vol->writers, @vol->exclusive, @vol->removed, and
212 * @vol->eba_tbl.
213 *
214 * @rsvd_pebs: count of reserved physical eraseblocks
215 * @avail_pebs: count of available physical eraseblocks
216 * @beb_rsvd_pebs: how many physical eraseblocks are reserved for bad PEB
217 * handling
218 * @beb_rsvd_level: normal level of PEBs reserved for bad PEB handling
219 *
220 * @vtbl_slots: how many slots are available in the volume table
221 * @vtbl_size: size of the volume table in bytes
222 * @vtbl: in-RAM volume table copy
223 *
224 * @max_ec: current highest erase counter value
225 * @mean_ec: current mean erase counter value
226 *
227 * @global_sqnum: global sequence number
228 * @ltree_lock: protects the lock tree and @global_sqnum
229 * @ltree: the lock tree
230 * @vtbl_mutex: protects on-flash volume table
231 *
232 * @used: RB-tree of used physical eraseblocks
233 * @free: RB-tree of free physical eraseblocks
234 * @scrub: RB-tree of physical eraseblocks which need scrubbing
235 * @prot: protection trees
236 * @prot.pnum: protection tree indexed by physical eraseblock numbers
237 * @prot.aec: protection tree indexed by absolute erase counter value
238 * @wl_lock: protects the @used, @free, @prot, @lookuptbl, @abs_ec, @move_from,
239 * @move_to, @move_from_put, @move_to_put, @wl_scheduled, and @works
240 * fields
241 * @wl_scheduled: non-zero if the wear-leveling was scheduled
242 * @lookuptbl: a table to quickly find a &struct ubi_wl_entry object for any
243 * physical eraseblock
244 * @abs_ec: absolute erase counter
245 * @move_from: physical eraseblock from where the data is being moved
246 * @move_to: physical eraseblock where the data is being moved to
247 * @move_from_put: if the "from" PEB was put
248 * @move_to_put: if the "to" PEB was put
249 * @works: list of pending works
250 * @works_count: count of pending works
251 * @bgt_thread: background thread description object
252 * @thread_enabled: if the background thread is enabled
253 * @bgt_name: background thread name
254 *
255 * @flash_size: underlying MTD device size (in bytes)
256 * @peb_count: count of physical eraseblocks on the MTD device
257 * @peb_size: physical eraseblock size
258 * @bad_peb_count: count of bad physical eraseblocks
259 * @good_peb_count: count of good physical eraseblocks
260 * @min_io_size: minimal input/output unit size of the underlying MTD device
261 * @hdrs_min_io_size: minimal I/O unit size used for VID and EC headers
262 * @ro_mode: if the UBI device is in read-only mode
263 * @leb_size: logical eraseblock size
264 * @leb_start: starting offset of logical eraseblocks within physical
265 * eraseblocks
266 * @ec_hdr_alsize: size of the EC header aligned to @hdrs_min_io_size
267 * @vid_hdr_alsize: size of the VID header aligned to @hdrs_min_io_size
268 * @vid_hdr_offset: starting offset of the volume identifier header (might be
269 * unaligned)
270 * @vid_hdr_aloffset: starting offset of the VID header aligned to
271 * @hdrs_min_io_size
272 * @vid_hdr_shift: contains @vid_hdr_offset - @vid_hdr_aloffset
273 * @bad_allowed: whether the MTD device may have bad physical eraseblocks
275 * @mtd: MTD device descriptor
276 */
277struct ubi_device {
278 struct cdev cdev;
279 struct device dev;
280 int ubi_num;
281 char ubi_name[sizeof(UBI_NAME_STR)+5];
282 int major;
283 int vol_count;
284 struct ubi_volume *volumes[UBI_MAX_VOLUMES+UBI_INT_VOL_COUNT];
285 spinlock_t volumes_lock;
286
287 int rsvd_pebs;
288 int avail_pebs;
289 int beb_rsvd_pebs;
290 int beb_rsvd_level;
291
292 int vtbl_slots;
293 int vtbl_size;
294 struct ubi_vtbl_record *vtbl;
295 struct mutex vtbl_mutex;
296
297 int max_ec;
298 int mean_ec;
299
300 /* EBA unit's stuff */
301 unsigned long long global_sqnum;
302 spinlock_t ltree_lock;
303 struct rb_root ltree;
304
305 /* Wear-leveling unit's stuff */
306 struct rb_root used;
307 struct rb_root free;
308 struct rb_root scrub;
309 struct {
310 struct rb_root pnum;
311 struct rb_root aec;
312 } prot;
313 spinlock_t wl_lock;
314 int wl_scheduled;
315 struct ubi_wl_entry **lookuptbl;
316 unsigned long long abs_ec;
317 struct ubi_wl_entry *move_from;
318 struct ubi_wl_entry *move_to;
319 int move_from_put;
320 int move_to_put;
321 struct list_head works;
322 int works_count;
323 struct task_struct *bgt_thread;
324 int thread_enabled;
325 char bgt_name[sizeof(UBI_BGT_NAME_PATTERN)+2];
326
327 /* I/O unit's stuff */
328 long long flash_size;
329 int peb_count;
330 int peb_size;
331 int bad_peb_count;
332 int good_peb_count;
333 int min_io_size;
334 int hdrs_min_io_size;
335 int ro_mode;
336 int leb_size;
337 int leb_start;
338 int ec_hdr_alsize;
339 int vid_hdr_alsize;
340 int vid_hdr_offset;
341 int vid_hdr_aloffset;
342 int vid_hdr_shift;
343 int bad_allowed;
344 struct mtd_info *mtd;
345};
346
347extern struct file_operations ubi_cdev_operations;
348extern struct file_operations ubi_vol_cdev_operations;
349extern struct class *ubi_class;
350
351/* vtbl.c */
352int ubi_change_vtbl_record(struct ubi_device *ubi, int idx,
353 struct ubi_vtbl_record *vtbl_rec);
354int ubi_read_volume_table(struct ubi_device *ubi, struct ubi_scan_info *si);
355
356/* vmt.c */
357int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req);
358int ubi_remove_volume(struct ubi_volume_desc *desc);
359int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs);
360int ubi_add_volume(struct ubi_device *ubi, int vol_id);
361void ubi_free_volume(struct ubi_device *ubi, int vol_id);
362
363/* upd.c */
364int ubi_start_update(struct ubi_device *ubi, int vol_id, long long bytes);
365int ubi_more_update_data(struct ubi_device *ubi, int vol_id,
366 const void __user *buf, int count);
367
368/* misc.c */
369int ubi_calc_data_len(const struct ubi_device *ubi, const void *buf, int length);
370int ubi_check_volume(struct ubi_device *ubi, int vol_id);
371void ubi_calculate_reserved(struct ubi_device *ubi);
372
373/* gluebi.c */
374#ifdef CONFIG_MTD_UBI_GLUEBI
375int ubi_create_gluebi(struct ubi_device *ubi, struct ubi_volume *vol);
376int ubi_destroy_gluebi(struct ubi_volume *vol);
377#else
378#define ubi_create_gluebi(ubi, vol) 0
379#define ubi_destroy_gluebi(vol) 0
380#endif
381
382/* eba.c */
383int ubi_eba_unmap_leb(struct ubi_device *ubi, int vol_id, int lnum);
384int ubi_eba_read_leb(struct ubi_device *ubi, int vol_id, int lnum, void *buf,
385 int offset, int len, int check);
386int ubi_eba_write_leb(struct ubi_device *ubi, int vol_id, int lnum,
387 const void *buf, int offset, int len, int dtype);
388int ubi_eba_write_leb_st(struct ubi_device *ubi, int vol_id, int lnum,
389 const void *buf, int len, int dtype,
390 int used_ebs);
391int ubi_eba_atomic_leb_change(struct ubi_device *ubi, int vol_id, int lnum,
392 const void *buf, int len, int dtype);
393int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to,
394 struct ubi_vid_hdr *vid_hdr);
395int ubi_eba_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si);
396void ubi_eba_close(const struct ubi_device *ubi);
397
398/* wl.c */
399int ubi_wl_get_peb(struct ubi_device *ubi, int dtype);
400int ubi_wl_put_peb(struct ubi_device *ubi, int pnum, int torture);
401int ubi_wl_flush(struct ubi_device *ubi);
402int ubi_wl_scrub_peb(struct ubi_device *ubi, int pnum);
403int ubi_wl_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si);
404void ubi_wl_close(struct ubi_device *ubi);
405
406/* io.c */
407int ubi_io_read(const struct ubi_device *ubi, void *buf, int pnum, int offset,
408 int len);
409int ubi_io_write(const struct ubi_device *ubi, const void *buf, int pnum,
410 int offset, int len);
411int ubi_io_sync_erase(const struct ubi_device *ubi, int pnum, int torture);
412int ubi_io_is_bad(const struct ubi_device *ubi, int pnum);
413int ubi_io_mark_bad(const struct ubi_device *ubi, int pnum);
414int ubi_io_read_ec_hdr(const struct ubi_device *ubi, int pnum,
415 struct ubi_ec_hdr *ec_hdr, int verbose);
416int ubi_io_write_ec_hdr(const struct ubi_device *ubi, int pnum,
417 struct ubi_ec_hdr *ec_hdr);
418int ubi_io_read_vid_hdr(const struct ubi_device *ubi, int pnum,
419 struct ubi_vid_hdr *vid_hdr, int verbose);
420int ubi_io_write_vid_hdr(const struct ubi_device *ubi, int pnum,
421 struct ubi_vid_hdr *vid_hdr);
422
423/*
424 * ubi_rb_for_each_entry - walk an RB-tree.
425 * @rb: a pointer to type 'struct rb_node' to use as a loop counter
426 * @pos: a pointer to RB-tree entry type to use as a loop counter
427 * @root: RB-tree's root
428 * @member: the name of the 'struct rb_node' within the RB-tree entry
429 */
430#define ubi_rb_for_each_entry(rb, pos, root, member) \
431 for (rb = rb_first(root), \
432 pos = (rb ? container_of(rb, typeof(*pos), member) : NULL); \
433 rb; \
434 rb = rb_next(rb), pos = container_of(rb, typeof(*pos), member))
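435
436/*
437 * Example (a sketch; assumes &struct ubi_wl_entry from wl.c, which embeds
438 * its 'struct rb_node' as the 'rb' member):
439 *
440 *	struct rb_node *rb;
441 *	struct ubi_wl_entry *e;
442 *
443 *	ubi_rb_for_each_entry(rb, e, &ubi->used, rb)
444 *		dbg_msg("PEB %d, erase counter %d", e->pnum, e->ec);
445 */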
435
436/**
437 * ubi_zalloc_vid_hdr - allocate a volume identifier header object.
438 * @ubi: UBI device description object
439 *
440 * This function returns a pointer to the newly allocated and zero-filled
441 * volume identifier header object in case of success and %NULL in case of
442 * failure.
443 */
444static inline struct ubi_vid_hdr *ubi_zalloc_vid_hdr(const struct ubi_device *ubi)
445{
446 void *vid_hdr;
447
448 vid_hdr = kzalloc(ubi->vid_hdr_alsize, GFP_KERNEL);
449 if (!vid_hdr)
450 return NULL;
451
452 /*
453 * VID headers may be stored at un-aligned flash offsets, so we shift
454 * the pointer.
455 */
456 return vid_hdr + ubi->vid_hdr_shift;
457}
458
459/**
460 * ubi_free_vid_hdr - free a volume identifier header object.
461 * @ubi: UBI device description object
462 * @vid_hdr: the object to free
463 */
464static inline void ubi_free_vid_hdr(const struct ubi_device *ubi,
465 struct ubi_vid_hdr *vid_hdr)
466{
467 void *p = vid_hdr;
468
469 if (!p)
470 return;
471
472 kfree(p - ubi->vid_hdr_shift);
473}
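474
475/*
476 * Typical usage of the two helpers above (sketch; error handling trimmed):
477 *
478 *	struct ubi_vid_hdr *vid_hdr;
479 *
480 *	vid_hdr = ubi_zalloc_vid_hdr(ubi);
481 *	if (!vid_hdr)
482 *		return -ENOMEM;
483 *	err = ubi_io_read_vid_hdr(ubi, pnum, vid_hdr, 0);
484 *	...
485 *	ubi_free_vid_hdr(ubi, vid_hdr);
486 */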
474
475/*
476 * This function is equivalent to 'ubi_io_read()', but @offset is relative to
477 * the beginning of the logical eraseblock, not to the beginning of the
478 * physical eraseblock.
479 */
480static inline int ubi_io_read_data(const struct ubi_device *ubi, void *buf,
481 int pnum, int offset, int len)
482{
483 ubi_assert(offset >= 0);
484 return ubi_io_read(ubi, buf, pnum, offset + ubi->leb_start, len);
485}
486
487/*
488 * This function is equivalent to 'ubi_io_write()', but @offset is relative to
489 * the beginning of the logical eraseblock, not to the beginning of the
490 * physical eraseblock.
491 */
492static inline int ubi_io_write_data(const struct ubi_device *ubi, const void *buf,
493 int pnum, int offset, int len)
494{
495 ubi_assert(offset >= 0);
496 return ubi_io_write(ubi, buf, pnum, offset + ubi->leb_start, len);
497}
498
499/**
500 * ubi_ro_mode - switch to read-only mode.
501 * @ubi: UBI device description object
502 */
503static inline void ubi_ro_mode(struct ubi_device *ubi)
504{
505 ubi->ro_mode = 1;
506 ubi_warn("switch to read-only mode");
507}
508
509/**
510 * vol_id2idx - get table index by volume ID.
511 * @ubi: UBI device description object
512 * @vol_id: volume ID
513 */
514static inline int vol_id2idx(const struct ubi_device *ubi, int vol_id)
515{
516 if (vol_id >= UBI_INTERNAL_VOL_START)
517 return vol_id - UBI_INTERNAL_VOL_START + ubi->vtbl_slots;
518 else
519 return vol_id;
520}
521
522/**
523 * idx2vol_id - get volume ID by table index.
524 * @ubi: UBI device description object
525 * @idx: table index
526 */
527static inline int idx2vol_id(const struct ubi_device *ubi, int idx)
528{
529 if (idx >= ubi->vtbl_slots)
530 return idx - ubi->vtbl_slots + UBI_INTERNAL_VOL_START;
531 else
532 return idx;
533}
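534
535/*
536 * The two helpers above are inverses of each other: for example, the first
537 * internal volume (vol_id %UBI_INTERNAL_VOL_START) maps to index
538 * @vtbl_slots, and idx2vol_id(ubi, vol_id2idx(ubi, vol_id)) == vol_id for
539 * any valid volume ID.
540 */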
534
535#endif /* !__UBI_UBI_H__ */
diff --git a/drivers/mtd/ubi/upd.c b/drivers/mtd/ubi/upd.c
new file mode 100644
index 000000000000..8925b977e3dc
--- /dev/null
+++ b/drivers/mtd/ubi/upd.c
@@ -0,0 +1,348 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 * Copyright (c) Nokia Corporation, 2006
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * Author: Artem Bityutskiy (Битюцкий Артём)
20 *
21 * Jan 2007: Alexander Schmidt, hacked per-volume update.
22 */
23
24/*
25 * This file contains the implementation of the volume update functionality.
26 *
27 * The update operation is based on the per-volume update marker which is
28 * stored in the volume table. The update marker is set before the update
29 * starts, and removed after the update has been finished. So if the update was
30 * interrupted by an unclean reboot or for some other reason, the update
31 * marker stays on the flash media and UBI finds it when it attaches the MTD
32 * device next time. If the update marker is set for a volume, the volume is
33 * treated as damaged and most I/O operations are prohibited. Only a new update
34 * operation is allowed.
35 *
36 * Note, in general it is possible to implement the update operation as a
37 * transaction with a roll-back capability.
38 */
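39
40/*
41 * From user space the update is driven through the volume character device,
42 * e.g. (sketch, without error handling):
43 *
44 *	long long bytes = image_size;
45 *
46 *	ioctl(fd, UBI_IOCVOLUP, &bytes);
47 *	write(fd, image, image_size);
48 *
49 * The ioctl ends up in 'ubi_start_update()' and each write() in
50 * 'ubi_more_update_data()' below.
51 */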
39
40#include <linux/err.h>
41#include <asm/uaccess.h>
42#include <asm/div64.h>
43#include "ubi.h"
44
45/**
46 * set_update_marker - set update marker.
47 * @ubi: UBI device description object
48 * @vol_id: volume ID
49 *
50 * This function sets the update marker flag for volume @vol_id. Returns zero
51 * in case of success and a negative error code in case of failure.
52 */
53static int set_update_marker(struct ubi_device *ubi, int vol_id)
54{
55 int err;
56 struct ubi_vtbl_record vtbl_rec;
57 struct ubi_volume *vol = ubi->volumes[vol_id];
58
59 dbg_msg("set update marker for volume %d", vol_id);
60
61 if (vol->upd_marker) {
62 ubi_assert(ubi->vtbl[vol_id].upd_marker);
63 dbg_msg("already set");
64 return 0;
65 }
66
67 memcpy(&vtbl_rec, &ubi->vtbl[vol_id], sizeof(struct ubi_vtbl_record));
68 vtbl_rec.upd_marker = 1;
69
70 err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec);
71 vol->upd_marker = 1;
72 return err;
73}
74
75/**
76 * clear_update_marker - clear update marker.
77 * @ubi: UBI device description object
78 * @vol_id: volume ID
79 * @bytes: new data size in bytes
80 *
81 * This function clears the update marker for volume @vol_id, sets new volume
82 * data size and clears the "corrupted" flag (static volumes only). Returns
83 * zero in case of success and a negative error code in case of failure.
84 */
85static int clear_update_marker(struct ubi_device *ubi, int vol_id, long long bytes)
86{
87 int err;
88 uint64_t tmp;
89 struct ubi_vtbl_record vtbl_rec;
90 struct ubi_volume *vol = ubi->volumes[vol_id];
91
92 dbg_msg("clear update marker for volume %d", vol_id);
93
94 memcpy(&vtbl_rec, &ubi->vtbl[vol_id], sizeof(struct ubi_vtbl_record));
95 ubi_assert(vol->upd_marker && vtbl_rec.upd_marker);
96 vtbl_rec.upd_marker = 0;
97
98 if (vol->vol_type == UBI_STATIC_VOLUME) {
99 vol->corrupted = 0;
100 vol->used_bytes = tmp = bytes;
101 vol->last_eb_bytes = do_div(tmp, vol->usable_leb_size);
102 vol->used_ebs = tmp;
103 if (vol->last_eb_bytes)
104 vol->used_ebs += 1;
105 else
106 vol->last_eb_bytes = vol->usable_leb_size;
107 }
108
109 err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec);
110 vol->upd_marker = 0;
111 return err;
112}
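113
114/*
115 * A worked example of the arithmetic above (illustrative numbers): with a
116 * 129024-byte usable LEB size, @bytes == 200000 yields used_ebs == 2 and
117 * last_eb_bytes == 70976, while an exact multiple of the LEB size leaves
118 * last_eb_bytes equal to the full usable LEB size.
119 */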
113
114/**
115 * ubi_start_update - start volume update.
116 * @ubi: UBI device description object
117 * @vol_id: volume ID
118 * @bytes: update bytes
119 *
120 * This function starts volume update operation. If @bytes is zero, the volume
121 * is just wiped out. Returns zero in case of success and a negative error code
122 * in case of failure.
123 */
124int ubi_start_update(struct ubi_device *ubi, int vol_id, long long bytes)
125{
126 int i, err;
127 uint64_t tmp;
128 struct ubi_volume *vol = ubi->volumes[vol_id];
129
130 dbg_msg("start update of volume %d, %llu bytes", vol_id, bytes);
131 vol->updating = 1;
132
133 err = set_update_marker(ubi, vol_id);
134 if (err)
135 return err;
136
137 /* Before updating - wipe out the volume */
138 for (i = 0; i < vol->reserved_pebs; i++) {
139 err = ubi_eba_unmap_leb(ubi, vol_id, i);
140 if (err)
141 return err;
142 }
143
144 if (bytes == 0) {
145 err = clear_update_marker(ubi, vol_id, 0);
146 if (err)
147 return err;
148 err = ubi_wl_flush(ubi);
149 if (!err)
150			vol->updating = 0;
151		return err;
152	}
153 vol->upd_buf = kmalloc(ubi->leb_size, GFP_KERNEL);
154 if (!vol->upd_buf)
155 return -ENOMEM;
156
157 tmp = bytes;
158 vol->upd_ebs = !!do_div(tmp, vol->usable_leb_size);
159 vol->upd_ebs += tmp;
160 vol->upd_bytes = bytes;
161 vol->upd_received = 0;
162 return 0;
163}
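164
165/*
166 * Note the ceiling division above: '!!do_div(tmp, ...) + tmp' rounds up, so
167 * e.g. a 200000-byte update with 129024-byte usable LEBs gives upd_ebs == 2.
168 */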
164
165/**
166 * write_leb - write update data.
167 * @ubi: UBI device description object
168 * @vol_id: volume ID
169 * @lnum: logical eraseblock number
170 * @buf: data to write
171 * @len: data size
172 * @used_ebs: how many logical eraseblocks will this volume contain (static
173 * volumes only)
174 *
175 * This function writes update data to the corresponding logical eraseblock.
176 * In case of a dynamic volume, this function checks whether the data ends
177 * with 0xFF bytes. If it does, the trailing 0xFF bytes are cut and not
178 * written. So if the whole buffer contains only 0xFF bytes, the LEB is left
179 * unmapped.
180 *
181 * The reason why we skip trailing 0xFF bytes in case of a dynamic volume is
182 * that we want to make sure more data may be appended to the logical
183 * eraseblock in the future. Indeed, writing 0xFF bytes may have side effects
184 * and the PEB might not be writable anymore. So if one writes a file-system
185 * image to a UBI volume where 0xFFs mean free space, UBI makes sure this
186 * free space stays writable after the update.
187 *
188 * We do not do this for static volumes because they are read-only, and
189 * because we have to store the per-LEB CRC and the exact data length anyway.
190 *
191 * This function returns zero in case of success and a negative error code in
192 * case of failure.
193 */
194static int write_leb(struct ubi_device *ubi, int vol_id, int lnum, void *buf,
195 int len, int used_ebs)
196{
197 int err, l;
198 struct ubi_volume *vol = ubi->volumes[vol_id];
199
200 if (vol->vol_type == UBI_DYNAMIC_VOLUME) {
201 l = ALIGN(len, ubi->min_io_size);
202 memset(buf + len, 0xFF, l - len);
203
204 l = ubi_calc_data_len(ubi, buf, l);
205 if (l == 0) {
206 dbg_msg("all %d bytes contain 0xFF - skip", len);
207 return 0;
208 }
209 if (len != l)
210 dbg_msg("skip last %d bytes (0xFF)", len - l);
211
212 err = ubi_eba_write_leb(ubi, vol_id, lnum, buf, 0, l,
213 UBI_UNKNOWN);
214 } else {
215 /*
216 * When writing static volume, and this is the last logical
217 * eraseblock, the length (@len) does not have to be aligned to
218 * the minimal flash I/O unit. The 'ubi_eba_write_leb_st()'
219 * function accepts exact (unaligned) length and stores it in
220 * the VID header. And it takes care of proper alignment by
221 * padding the buffer. Here we just make sure the padding will
222 * contain zeros, not random trash.
223 */
224 memset(buf + len, 0, vol->usable_leb_size - len);
225 err = ubi_eba_write_leb_st(ubi, vol_id, lnum, buf, len,
226 UBI_UNKNOWN, used_ebs);
227 }
228
229 return err;
230}
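231
232/*
233 * For example (illustrative numbers): if a 4096-byte chunk of a dynamic
234 * volume ends with 1000 bytes of 0xFF and @min_io_size is 512, then
235 * 'ubi_calc_data_len()' trims the write down to 3584 bytes - the smallest
236 * @min_io_size-aligned length still covering the non-0xFF data - while an
237 * all-0xFF chunk is skipped entirely, leaving the LEB unmapped.
238 */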
231
232/**
233 * ubi_more_update_data - write more update data.
234 * @ubi: UBI device description object
235 * @vol_id: volume ID
236 * @buf: write data (user-space memory buffer)
237 * @count: how many bytes to write
238 *
239 * This function writes more data to the volume being updated; it may be
240 * called any number of times until all of the update data arrives. Returns
241 * %0 in case of success, the number of bytes written during the last call if
242 * the whole update finished successfully, and a negative error code otherwise.
243 */
244int ubi_more_update_data(struct ubi_device *ubi, int vol_id,
245 const void __user *buf, int count)
246{
247 uint64_t tmp;
248 struct ubi_volume *vol = ubi->volumes[vol_id];
249 int lnum, offs, err = 0, len, to_write = count;
250
251 dbg_msg("write %d of %lld bytes, %lld already passed",
252 count, vol->upd_bytes, vol->upd_received);
253
254 if (ubi->ro_mode)
255 return -EROFS;
256
257 tmp = vol->upd_received;
258 offs = do_div(tmp, vol->usable_leb_size);
259 lnum = tmp;
260
261 if (vol->upd_received + count > vol->upd_bytes)
262 to_write = count = vol->upd_bytes - vol->upd_received;
263
264 /*
265 * When updating volumes, we accumulate whole logical eraseblock of
266 * data and write it at once.
267 */
268 if (offs != 0) {
269 /*
270 * This is a write to the middle of the logical eraseblock. We
271 * copy the data to our update buffer and wait for more data or
272 * flush it if the whole eraseblock is written or the update
273 * is finished.
274 */
275
276 len = vol->usable_leb_size - offs;
277 if (len > count)
278 len = count;
279
280 err = copy_from_user(vol->upd_buf + offs, buf, len);
281 if (err)
282 return -EFAULT;
283
284 if (offs + len == vol->usable_leb_size ||
285 vol->upd_received + len == vol->upd_bytes) {
286 int flush_len = offs + len;
287
288 /*
289 * OK, we gathered either the whole eraseblock or this
290 * is the last chunk, it's time to flush the buffer.
291 */
292 ubi_assert(flush_len <= vol->usable_leb_size);
293 err = write_leb(ubi, vol_id, lnum, vol->upd_buf,
294 flush_len, vol->upd_ebs);
295 if (err)
296 return err;
297 }
298
299 vol->upd_received += len;
300 count -= len;
301 buf += len;
302 lnum += 1;
303 }
304
305 /*
306 * If we've got more to write, let's continue. At this point we know we
307 * are starting from the beginning of an eraseblock.
308 */
309 while (count) {
310 if (count > vol->usable_leb_size)
311 len = vol->usable_leb_size;
312 else
313 len = count;
314
315 err = copy_from_user(vol->upd_buf, buf, len);
316 if (err)
317 return -EFAULT;
318
319 if (len == vol->usable_leb_size ||
320 vol->upd_received + len == vol->upd_bytes) {
321 err = write_leb(ubi, vol_id, lnum, vol->upd_buf, len,
322 vol->upd_ebs);
323 if (err)
324 break;
325 }
326
327 vol->upd_received += len;
328 count -= len;
329 lnum += 1;
330 buf += len;
331 }
332
333 ubi_assert(vol->upd_received <= vol->upd_bytes);
334 if (vol->upd_received == vol->upd_bytes) {
335 /* The update is finished, clear the update marker */
336 err = clear_update_marker(ubi, vol_id, vol->upd_bytes);
337 if (err)
338 return err;
339 err = ubi_wl_flush(ubi);
340 if (err == 0) {
341 err = to_write;
342 kfree(vol->upd_buf);
343 vol->updating = 0;
344 }
345 }
346
347 return err;
348}
diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c
new file mode 100644
index 000000000000..622d0d18952c
--- /dev/null
+++ b/drivers/mtd/ubi/vmt.c
@@ -0,0 +1,809 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21/*
22 * This file contains the implementation of volume creation, deletion,
23 * updating and resizing.
24 */
25
26#include <linux/err.h>
27#include <asm/div64.h>
28#include "ubi.h"
29
30#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
31static void paranoid_check_volumes(struct ubi_device *ubi);
32#else
33#define paranoid_check_volumes(ubi)
34#endif
35
36static ssize_t vol_attribute_show(struct device *dev,
37 struct device_attribute *attr, char *buf);
38
39/* Device attributes corresponding to files in '/<sysfs>/class/ubi/ubiX_Y' */
40static struct device_attribute vol_reserved_ebs =
41 __ATTR(reserved_ebs, S_IRUGO, vol_attribute_show, NULL);
42static struct device_attribute vol_type =
43 __ATTR(type, S_IRUGO, vol_attribute_show, NULL);
44static struct device_attribute vol_name =
45 __ATTR(name, S_IRUGO, vol_attribute_show, NULL);
46static struct device_attribute vol_corrupted =
47 __ATTR(corrupted, S_IRUGO, vol_attribute_show, NULL);
48static struct device_attribute vol_alignment =
49 __ATTR(alignment, S_IRUGO, vol_attribute_show, NULL);
50static struct device_attribute vol_usable_eb_size =
51 __ATTR(usable_eb_size, S_IRUGO, vol_attribute_show, NULL);
52static struct device_attribute vol_data_bytes =
53 __ATTR(data_bytes, S_IRUGO, vol_attribute_show, NULL);
54static struct device_attribute vol_upd_marker =
55 __ATTR(upd_marker, S_IRUGO, vol_attribute_show, NULL);
56
57/*
58 * "Show" method for files in '/<sysfs>/class/ubi/ubiX_Y/'.
59 *
60 * Consider a situation:
61 * A. process 1 opens a sysfs file related to volume Y, say
62 * /<sysfs>/class/ubi/ubiX_Y/reserved_ebs;
63 * B. process 2 removes volume Y;
64 * C. process 1 starts reading the /<sysfs>/class/ubi/ubiX_Y/reserved_ebs file;
65 *
66 * What we want to do in a situation like that is to return an error when the
67 * file is read. This is done by means of the per-volume 'removed' flag and
68 * the 'volumes_lock' spinlock of the UBI device description object.
69 */
70static ssize_t vol_attribute_show(struct device *dev,
71 struct device_attribute *attr, char *buf)
72{
73 int ret;
74 struct ubi_volume *vol = container_of(dev, struct ubi_volume, dev);
75
76 spin_lock(&vol->ubi->volumes_lock);
77 if (vol->removed) {
78 spin_unlock(&vol->ubi->volumes_lock);
79 return -ENODEV;
80 }
81 if (attr == &vol_reserved_ebs)
82 ret = sprintf(buf, "%d\n", vol->reserved_pebs);
83 else if (attr == &vol_type) {
84 const char *tp;
85 tp = vol->vol_type == UBI_DYNAMIC_VOLUME ? "dynamic" : "static";
86 ret = sprintf(buf, "%s\n", tp);
87 } else if (attr == &vol_name)
88 ret = sprintf(buf, "%s\n", vol->name);
89 else if (attr == &vol_corrupted)
90 ret = sprintf(buf, "%d\n", vol->corrupted);
91 else if (attr == &vol_alignment)
92 ret = sprintf(buf, "%d\n", vol->alignment);
93 else if (attr == &vol_usable_eb_size) {
94 ret = sprintf(buf, "%d\n", vol->usable_leb_size);
95 } else if (attr == &vol_data_bytes)
96 ret = sprintf(buf, "%lld\n", vol->used_bytes);
97 else if (attr == &vol_upd_marker)
98 ret = sprintf(buf, "%d\n", vol->upd_marker);
99 else
100 BUG();
101 spin_unlock(&vol->ubi->volumes_lock);
102 return ret;
103}
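104
105/*
106 * For example, reading one of these files from user space (output values
107 * are illustrative):
108 *
109 *	$ cat /sys/class/ubi/ubi0_1/type
110 *	dynamic
111 */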
104
105/* Release method for volume devices */
106static void vol_release(struct device *dev)
107{
108 struct ubi_volume *vol = container_of(dev, struct ubi_volume, dev);
109 ubi_assert(vol->removed);
110 kfree(vol);
111}
112
113/**
114 * volume_sysfs_init - initialize sysfs for new volume.
115 * @ubi: UBI device description object
116 * @vol: volume description object
117 *
118 * This function returns zero in case of success and a negative error code in
119 * case of failure.
120 *
121 * Note, this function does not free allocated resources in case of failure -
122 * the caller does it. This is because freeing the resources here would
123 * trigger release(), and then the caller would oops.
124 */
125static int volume_sysfs_init(struct ubi_device *ubi, struct ubi_volume *vol)
126{
127 int err;
128
129 err = device_create_file(&vol->dev, &vol_reserved_ebs);
130 if (err)
131 return err;
132 err = device_create_file(&vol->dev, &vol_type);
133 if (err)
134 return err;
135 err = device_create_file(&vol->dev, &vol_name);
136 if (err)
137 return err;
138 err = device_create_file(&vol->dev, &vol_corrupted);
139 if (err)
140 return err;
141 err = device_create_file(&vol->dev, &vol_alignment);
142 if (err)
143 return err;
144 err = device_create_file(&vol->dev, &vol_usable_eb_size);
145 if (err)
146 return err;
147 err = device_create_file(&vol->dev, &vol_data_bytes);
148 if (err)
149 return err;
150 err = device_create_file(&vol->dev, &vol_upd_marker);
151 if (err)
152 return err;
153 return 0;
154}
155
156/**
157 * volume_sysfs_close - close sysfs for a volume.
158 * @vol: volume description object
159 */
160static void volume_sysfs_close(struct ubi_volume *vol)
161{
162 device_remove_file(&vol->dev, &vol_upd_marker);
163 device_remove_file(&vol->dev, &vol_data_bytes);
164 device_remove_file(&vol->dev, &vol_usable_eb_size);
165 device_remove_file(&vol->dev, &vol_alignment);
166 device_remove_file(&vol->dev, &vol_corrupted);
167 device_remove_file(&vol->dev, &vol_name);
168 device_remove_file(&vol->dev, &vol_type);
169 device_remove_file(&vol->dev, &vol_reserved_ebs);
170 device_unregister(&vol->dev);
171}
172
173/**
174 * ubi_create_volume - create volume.
175 * @ubi: UBI device description object
176 * @req: volume creation request
177 *
178 * This function creates the volume described by @req. If @req->vol_id is
179 * %UBI_VOL_NUM_AUTO, this function automatically assigns an ID to the new
180 * volume and saves it in @req->vol_id. Returns zero in case of success and a
181 * negative error code in case of failure.
182 */
183int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req)
184{
185 int i, err, vol_id = req->vol_id;
186 struct ubi_volume *vol;
187 struct ubi_vtbl_record vtbl_rec;
188 uint64_t bytes;
189
190 if (ubi->ro_mode)
191 return -EROFS;
192
193 vol = kzalloc(sizeof(struct ubi_volume), GFP_KERNEL);
194 if (!vol)
195 return -ENOMEM;
196
197 spin_lock(&ubi->volumes_lock);
198
199 if (vol_id == UBI_VOL_NUM_AUTO) {
200 /* Find unused volume ID */
201 dbg_msg("search for vacant volume ID");
202 for (i = 0; i < ubi->vtbl_slots; i++)
203 if (!ubi->volumes[i]) {
204 vol_id = i;
205 break;
206 }
207
208 if (vol_id == UBI_VOL_NUM_AUTO) {
209 dbg_err("out of volume IDs");
210 err = -ENFILE;
211 goto out_unlock;
212 }
213 req->vol_id = vol_id;
214 }
215
216 dbg_msg("volume ID %d, %llu bytes, type %d, name %s",
217 vol_id, (unsigned long long)req->bytes,
218 (int)req->vol_type, req->name);
219
220 /* Ensure that this volume does not exist */
221 err = -EEXIST;
222 if (ubi->volumes[vol_id]) {
223 dbg_err("volume %d already exists", vol_id);
224 goto out_unlock;
225 }
226
227 /* Ensure that the name is unique */
228 for (i = 0; i < ubi->vtbl_slots; i++)
229 if (ubi->volumes[i] &&
230 ubi->volumes[i]->name_len == req->name_len &&
231 strcmp(ubi->volumes[i]->name, req->name) == 0) {
232 dbg_err("volume \"%s\" exists (ID %d)", req->name, i);
233 goto out_unlock;
234 }
235
236 /* Calculate how many eraseblocks are requested */
237 vol->usable_leb_size = ubi->leb_size - ubi->leb_size % req->alignment;
238 bytes = req->bytes;
239 if (do_div(bytes, vol->usable_leb_size))
240 vol->reserved_pebs = 1;
241 vol->reserved_pebs += bytes;
242
243 /* Reserve physical eraseblocks */
244 if (vol->reserved_pebs > ubi->avail_pebs) {
245 dbg_err("not enough PEBs, only %d available", ubi->avail_pebs);
246 spin_unlock(&ubi->volumes_lock);
247 err = -ENOSPC;
248 goto out_unlock;
249 }
250 ubi->avail_pebs -= vol->reserved_pebs;
251 ubi->rsvd_pebs += vol->reserved_pebs;
252
253 vol->vol_id = vol_id;
254 vol->alignment = req->alignment;
255 vol->data_pad = ubi->leb_size % vol->alignment;
256 vol->vol_type = req->vol_type;
257 vol->name_len = req->name_len;
258 memcpy(vol->name, req->name, vol->name_len + 1);
259 vol->exclusive = 1;
260 vol->ubi = ubi;
261 ubi->volumes[vol_id] = vol;
262 spin_unlock(&ubi->volumes_lock);
263
264 /*
265 * Finish all pending erases because there may be some LEBs belonging
266 * to the same volume ID.
267 */
268 err = ubi_wl_flush(ubi);
269 if (err)
270 goto out_acc;
271
272 vol->eba_tbl = kmalloc(vol->reserved_pebs * sizeof(int), GFP_KERNEL);
273 if (!vol->eba_tbl) {
274 err = -ENOMEM;
275 goto out_acc;
276 }
277
278 for (i = 0; i < vol->reserved_pebs; i++)
279 vol->eba_tbl[i] = UBI_LEB_UNMAPPED;
280
281 if (vol->vol_type == UBI_DYNAMIC_VOLUME) {
282 vol->used_ebs = vol->reserved_pebs;
283 vol->last_eb_bytes = vol->usable_leb_size;
284 vol->used_bytes = vol->used_ebs * vol->usable_leb_size;
285 } else {
286 bytes = vol->used_bytes;
287 vol->last_eb_bytes = do_div(bytes, vol->usable_leb_size);
288 vol->used_ebs = bytes;
289 if (vol->last_eb_bytes)
290 vol->used_ebs += 1;
291 else
292 vol->last_eb_bytes = vol->usable_leb_size;
293 }
294
295 /* Register character device for the volume */
296 cdev_init(&vol->cdev, &ubi_vol_cdev_operations);
297 vol->cdev.owner = THIS_MODULE;
298 err = cdev_add(&vol->cdev, MKDEV(ubi->major, vol_id + 1), 1);
299 if (err) {
300 ubi_err("cannot add character device for volume %d", vol_id);
301 goto out_mapping;
302 }
303
304 err = ubi_create_gluebi(ubi, vol);
305 if (err)
306 goto out_cdev;
307
308 vol->dev.release = vol_release;
309 vol->dev.parent = &ubi->dev;
310 vol->dev.devt = MKDEV(ubi->major, vol->vol_id + 1);
311 vol->dev.class = ubi_class;
312 sprintf(&vol->dev.bus_id[0], "%s_%d", ubi->ubi_name, vol->vol_id);
313 err = device_register(&vol->dev);
314 if (err)
315 goto out_gluebi;
316
317 err = volume_sysfs_init(ubi, vol);
318 if (err)
319 goto out_sysfs;
320
321 /* Fill volume table record */
322 memset(&vtbl_rec, 0, sizeof(struct ubi_vtbl_record));
323 vtbl_rec.reserved_pebs = cpu_to_ubi32(vol->reserved_pebs);
324 vtbl_rec.alignment = cpu_to_ubi32(vol->alignment);
325 vtbl_rec.data_pad = cpu_to_ubi32(vol->data_pad);
326 vtbl_rec.name_len = cpu_to_ubi16(vol->name_len);
327 if (vol->vol_type == UBI_DYNAMIC_VOLUME)
328 vtbl_rec.vol_type = UBI_VID_DYNAMIC;
329 else
330 vtbl_rec.vol_type = UBI_VID_STATIC;
331 memcpy(vtbl_rec.name, vol->name, vol->name_len + 1);
332
333 err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec);
334 if (err)
335 goto out_sysfs;
336
337 spin_lock(&ubi->volumes_lock);
338 ubi->vol_count += 1;
339 vol->exclusive = 0;
340 spin_unlock(&ubi->volumes_lock);
341
342 paranoid_check_volumes(ubi);
343 return 0;
344
345out_gluebi:
346 err = ubi_destroy_gluebi(vol);
347out_cdev:
348 cdev_del(&vol->cdev);
349out_mapping:
350 kfree(vol->eba_tbl);
351out_acc:
352 spin_lock(&ubi->volumes_lock);
353 ubi->rsvd_pebs -= vol->reserved_pebs;
354 ubi->avail_pebs += vol->reserved_pebs;
355out_unlock:
356 spin_unlock(&ubi->volumes_lock);
357 kfree(vol);
358 return err;
359
360 /*
361 * We are registered, so @vol is destroyed in the release function and
362 * we have to de-initialize differently.
363 */
364out_sysfs:
365 err = ubi_destroy_gluebi(vol);
366 cdev_del(&vol->cdev);
367 kfree(vol->eba_tbl);
368 spin_lock(&ubi->volumes_lock);
369 ubi->rsvd_pebs -= vol->reserved_pebs;
370 ubi->avail_pebs += vol->reserved_pebs;
371 spin_unlock(&ubi->volumes_lock);
372 volume_sysfs_close(vol);
373 return err;
374}
375
376/**
377 * ubi_remove_volume - remove volume.
378 * @desc: volume descriptor
379 *
380 * This function removes volume described by @desc. The volume has to be opened
381 * in "exclusive" mode. Returns zero in case of success and a negative error
382 * code in case of failure.
383 */
384int ubi_remove_volume(struct ubi_volume_desc *desc)
385{
386 struct ubi_volume *vol = desc->vol;
387 struct ubi_device *ubi = vol->ubi;
388 int i, err, vol_id = vol->vol_id, reserved_pebs = vol->reserved_pebs;
389
390 dbg_msg("remove UBI volume %d", vol_id);
391 ubi_assert(desc->mode == UBI_EXCLUSIVE);
392 ubi_assert(vol == ubi->volumes[vol_id]);
393
394 if (ubi->ro_mode)
395 return -EROFS;
396
397 err = ubi_destroy_gluebi(vol);
398 if (err)
399 return err;
400
401 err = ubi_change_vtbl_record(ubi, vol_id, NULL);
402 if (err)
403 return err;
404
405 for (i = 0; i < vol->reserved_pebs; i++) {
406 err = ubi_eba_unmap_leb(ubi, vol_id, i);
407 if (err)
408 return err;
409 }
410
411 spin_lock(&ubi->volumes_lock);
412 vol->removed = 1;
413 ubi->volumes[vol_id] = NULL;
414 spin_unlock(&ubi->volumes_lock);
415
416 kfree(vol->eba_tbl);
417 vol->eba_tbl = NULL;
418 cdev_del(&vol->cdev);
419 volume_sysfs_close(vol);
420 kfree(desc);
421
422 spin_lock(&ubi->volumes_lock);
423 ubi->rsvd_pebs -= reserved_pebs;
424 ubi->avail_pebs += reserved_pebs;
425 i = ubi->beb_rsvd_level - ubi->beb_rsvd_pebs;
426 if (i > 0) {
427 i = ubi->avail_pebs >= i ? i : ubi->avail_pebs;
428 ubi->avail_pebs -= i;
429 ubi->rsvd_pebs += i;
430 ubi->beb_rsvd_pebs += i;
431 if (i > 0)
432			ubi_msg("reserve %d more PEBs", i);
433 }
434 ubi->vol_count -= 1;
435 spin_unlock(&ubi->volumes_lock);
436
437 paranoid_check_volumes(ubi);
438 module_put(THIS_MODULE);
439 return 0;
440}
441
442/**
443 * ubi_resize_volume - re-size volume.
444 * @desc: volume descriptor
445 * @reserved_pebs: new size in physical eraseblocks
446 *
447 * This function returns zero in case of success, and a negative error code in
448 * case of failure.
449 */
450int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs)
451{
452 int i, err, pebs, *new_mapping;
453 struct ubi_volume *vol = desc->vol;
454 struct ubi_device *ubi = vol->ubi;
455 struct ubi_vtbl_record vtbl_rec;
456 int vol_id = vol->vol_id;
457
458 if (ubi->ro_mode)
459 return -EROFS;
460
461	dbg_msg("re-size volume %d from %d to %d PEBs",
462 vol_id, vol->reserved_pebs, reserved_pebs);
463 ubi_assert(desc->mode == UBI_EXCLUSIVE);
464 ubi_assert(vol == ubi->volumes[vol_id]);
465
466 if (vol->vol_type == UBI_STATIC_VOLUME &&
467 reserved_pebs < vol->used_ebs) {
468 dbg_err("too small size %d, %d LEBs contain data",
469 reserved_pebs, vol->used_ebs);
470 return -EINVAL;
471 }
472
473 /* If the size is the same, we have nothing to do */
474 if (reserved_pebs == vol->reserved_pebs)
475 return 0;
476
477 new_mapping = kmalloc(reserved_pebs * sizeof(int), GFP_KERNEL);
478 if (!new_mapping)
479 return -ENOMEM;
480
481 for (i = 0; i < reserved_pebs; i++)
482 new_mapping[i] = UBI_LEB_UNMAPPED;
483
484 /* Reserve physical eraseblocks */
485 pebs = reserved_pebs - vol->reserved_pebs;
486 if (pebs > 0) {
487 spin_lock(&ubi->volumes_lock);
488 if (pebs > ubi->avail_pebs) {
489 dbg_err("not enough PEBs: requested %d, available %d",
490 pebs, ubi->avail_pebs);
491 spin_unlock(&ubi->volumes_lock);
492 err = -ENOSPC;
493 goto out_free;
494 }
495 ubi->avail_pebs -= pebs;
496 ubi->rsvd_pebs += pebs;
497 for (i = 0; i < vol->reserved_pebs; i++)
498 new_mapping[i] = vol->eba_tbl[i];
499 kfree(vol->eba_tbl);
500 vol->eba_tbl = new_mapping;
501 spin_unlock(&ubi->volumes_lock);
502 }
503
504 /* Change volume table record */
505 memcpy(&vtbl_rec, &ubi->vtbl[vol_id], sizeof(struct ubi_vtbl_record));
506 vtbl_rec.reserved_pebs = cpu_to_ubi32(reserved_pebs);
507 err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec);
508 if (err)
509 goto out_acc;
510
511 if (pebs < 0) {
512 for (i = 0; i < -pebs; i++) {
513 err = ubi_eba_unmap_leb(ubi, vol_id, reserved_pebs + i);
514 if (err)
515 goto out_acc;
516 }
517 spin_lock(&ubi->volumes_lock);
518 ubi->rsvd_pebs += pebs;
519 ubi->avail_pebs -= pebs;
520 pebs = ubi->beb_rsvd_level - ubi->beb_rsvd_pebs;
521 if (pebs > 0) {
522 pebs = ubi->avail_pebs >= pebs ? pebs : ubi->avail_pebs;
523 ubi->avail_pebs -= pebs;
524 ubi->rsvd_pebs += pebs;
525 ubi->beb_rsvd_pebs += pebs;
526 if (pebs > 0)
527				ubi_msg("reserve %d more PEBs", pebs);
528 }
529 for (i = 0; i < reserved_pebs; i++)
530 new_mapping[i] = vol->eba_tbl[i];
531 kfree(vol->eba_tbl);
532 vol->eba_tbl = new_mapping;
533 spin_unlock(&ubi->volumes_lock);
534 }
535
536 vol->reserved_pebs = reserved_pebs;
537 if (vol->vol_type == UBI_DYNAMIC_VOLUME) {
538 vol->used_ebs = reserved_pebs;
539 vol->last_eb_bytes = vol->usable_leb_size;
540 vol->used_bytes = vol->used_ebs * vol->usable_leb_size;
541 }
542
543 paranoid_check_volumes(ubi);
544 return 0;
545
546out_acc:
547 if (pebs > 0) {
548 spin_lock(&ubi->volumes_lock);
549 ubi->rsvd_pebs -= pebs;
550 ubi->avail_pebs += pebs;
551 spin_unlock(&ubi->volumes_lock);
552 }
553out_free:
554 kfree(new_mapping);
555 return err;
556}
557
558/**
559 * ubi_add_volume - add volume.
560 * @ubi: UBI device description object
561 * @vol_id: volume ID
562 *
563 * This function adds an existing volume and initializes all its data
564 * structures. Returns zero in case of success and a negative error code in
565 * case of failure.
566 */
567int ubi_add_volume(struct ubi_device *ubi, int vol_id)
568{
569 int err;
570 struct ubi_volume *vol = ubi->volumes[vol_id];
571
572 dbg_msg("add volume %d", vol_id);
573	ubi_assert(vol);
574	ubi_dbg_dump_vol_info(vol);
575
576 /* Register character device for the volume */
577 cdev_init(&vol->cdev, &ubi_vol_cdev_operations);
578 vol->cdev.owner = THIS_MODULE;
579 err = cdev_add(&vol->cdev, MKDEV(ubi->major, vol->vol_id + 1), 1);
580 if (err) {
581 ubi_err("cannot add character device for volume %d", vol_id);
582 return err;
583 }
584
585 err = ubi_create_gluebi(ubi, vol);
586 if (err)
587 goto out_cdev;
588
589 vol->dev.release = vol_release;
590 vol->dev.parent = &ubi->dev;
591 vol->dev.devt = MKDEV(ubi->major, vol->vol_id + 1);
592 vol->dev.class = ubi_class;
593 sprintf(&vol->dev.bus_id[0], "%s_%d", ubi->ubi_name, vol->vol_id);
594 err = device_register(&vol->dev);
595 if (err)
596 goto out_gluebi;
597
598 err = volume_sysfs_init(ubi, vol);
599 if (err) {
600 cdev_del(&vol->cdev);
601 err = ubi_destroy_gluebi(vol);
602 volume_sysfs_close(vol);
603 return err;
604 }
605
606 paranoid_check_volumes(ubi);
607 return 0;
608
609out_gluebi:
610 err = ubi_destroy_gluebi(vol);
611out_cdev:
612 cdev_del(&vol->cdev);
613 return err;
614}
615
616/**
617 * ubi_free_volume - free volume.
618 * @ubi: UBI device description object
619 * @vol_id: volume ID
620 *
621 * This function frees all resources for volume @vol_id but does not remove it.
622 * Used only when the UBI device is detached.
623 */
624void ubi_free_volume(struct ubi_device *ubi, int vol_id)
625{
626 int err;
627 struct ubi_volume *vol = ubi->volumes[vol_id];
628
629 dbg_msg("free volume %d", vol_id);
630 ubi_assert(vol);
631
632 vol->removed = 1;
633 err = ubi_destroy_gluebi(vol);
634 ubi->volumes[vol_id] = NULL;
635 cdev_del(&vol->cdev);
636 volume_sysfs_close(vol);
637}
638
639#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
640
641/**
642 * paranoid_check_volume - check volume information.
643 * @ubi: UBI device description object
644 * @vol_id: volume ID
645 */
646static void paranoid_check_volume(const struct ubi_device *ubi, int vol_id)
647{
648 int idx = vol_id2idx(ubi, vol_id);
649 int reserved_pebs, alignment, data_pad, vol_type, name_len, upd_marker;
650 const struct ubi_volume *vol = ubi->volumes[idx];
651 long long n;
652 const char *name;
653
654 reserved_pebs = ubi32_to_cpu(ubi->vtbl[vol_id].reserved_pebs);
655
656 if (!vol) {
657 if (reserved_pebs) {
658 ubi_err("no volume info, but volume exists");
659 goto fail;
660 }
661 return;
662 }
663
664 if (vol->reserved_pebs < 0 || vol->alignment < 0 || vol->data_pad < 0 ||
665 vol->name_len < 0) {
666 ubi_err("negative values");
667 goto fail;
668 }
669 if (vol->alignment > ubi->leb_size || vol->alignment == 0) {
670 ubi_err("bad alignment");
671 goto fail;
672 }
673
674 n = vol->alignment % ubi->min_io_size;
675 if (vol->alignment != 1 && n) {
676 ubi_err("alignment is not multiple of min I/O unit");
677 goto fail;
678 }
679
680 n = ubi->leb_size % vol->alignment;
681 if (vol->data_pad != n) {
682 ubi_err("bad data_pad, has to be %lld", n);
683 goto fail;
684 }
685
686 if (vol->vol_type != UBI_DYNAMIC_VOLUME &&
687 vol->vol_type != UBI_STATIC_VOLUME) {
688 ubi_err("bad vol_type");
689 goto fail;
690 }
691
692 if (vol->upd_marker != 0 && vol->upd_marker != 1) {
693 ubi_err("bad upd_marker");
694 goto fail;
695 }
696
697 if (vol->upd_marker && vol->corrupted) {
698 dbg_err("update marker and corrupted simultaneously");
699 goto fail;
700 }
701
702 if (vol->reserved_pebs > ubi->good_peb_count) {
703 ubi_err("too large reserved_pebs");
704 goto fail;
705 }
706
707 n = ubi->leb_size - vol->data_pad;
708 if (vol->usable_leb_size != ubi->leb_size - vol->data_pad) {
709 ubi_err("bad usable_leb_size, has to be %lld", n);
710 goto fail;
711 }
712
713 if (vol->name_len > UBI_VOL_NAME_MAX) {
714 ubi_err("too long volume name, max is %d", UBI_VOL_NAME_MAX);
715 goto fail;
716 }
717
718 if (!vol->name) {
719 ubi_err("NULL volume name");
720 goto fail;
721 }
722
723 n = strnlen(vol->name, vol->name_len + 1);
724 if (n != vol->name_len) {
725 ubi_err("bad name_len %lld", n);
726 goto fail;
727 }
728
729 n = vol->used_ebs * vol->usable_leb_size;
730 if (vol->vol_type == UBI_DYNAMIC_VOLUME) {
731 if (vol->corrupted != 0) {
732 ubi_err("corrupted dynamic volume");
733 goto fail;
734 }
735 if (vol->used_ebs != vol->reserved_pebs) {
736 ubi_err("bad used_ebs");
737 goto fail;
738 }
739 if (vol->last_eb_bytes != vol->usable_leb_size) {
740 ubi_err("bad last_eb_bytes");
741 goto fail;
742 }
743 if (vol->used_bytes != n) {
744 ubi_err("bad used_bytes");
745 goto fail;
746 }
747 } else {
748 if (vol->corrupted != 0 && vol->corrupted != 1) {
749 ubi_err("bad corrupted");
750 goto fail;
751 }
752 if (vol->used_ebs < 0 || vol->used_ebs > vol->reserved_pebs) {
753 ubi_err("bad used_ebs");
754 goto fail;
755 }
756 if (vol->last_eb_bytes < 0 ||
757 vol->last_eb_bytes > vol->usable_leb_size) {
758 ubi_err("bad last_eb_bytes");
759 goto fail;
760 }
761 if (vol->used_bytes < 0 || vol->used_bytes > n ||
762 vol->used_bytes < n - vol->usable_leb_size) {
763 ubi_err("bad used_bytes");
764 goto fail;
765 }
766 }
767
768 alignment = ubi32_to_cpu(ubi->vtbl[vol_id].alignment);
769 data_pad = ubi32_to_cpu(ubi->vtbl[vol_id].data_pad);
770 name_len = ubi16_to_cpu(ubi->vtbl[vol_id].name_len);
771 upd_marker = ubi->vtbl[vol_id].upd_marker;
772 name = &ubi->vtbl[vol_id].name[0];
773 if (ubi->vtbl[vol_id].vol_type == UBI_VID_DYNAMIC)
774 vol_type = UBI_DYNAMIC_VOLUME;
775 else
776 vol_type = UBI_STATIC_VOLUME;
777
778 if (alignment != vol->alignment || data_pad != vol->data_pad ||
779 upd_marker != vol->upd_marker || vol_type != vol->vol_type ||
780	    name_len != vol->name_len || strncmp(name, vol->name, name_len)) {
781 ubi_err("volume info is different");
782 goto fail;
783 }
784
785 return;
786
787fail:
788 ubi_err("paranoid check failed");
789 ubi_dbg_dump_vol_info(vol);
790 ubi_dbg_dump_vtbl_record(&ubi->vtbl[vol_id], vol_id);
791 BUG();
792}
793
794/**
795 * paranoid_check_volumes - check information about all volumes.
796 * @ubi: UBI device description object
797 */
798static void paranoid_check_volumes(struct ubi_device *ubi)
799{
800 int i;
801
802 mutex_lock(&ubi->vtbl_mutex);
803 spin_lock(&ubi->volumes_lock);
804 for (i = 0; i < ubi->vtbl_slots; i++)
805 paranoid_check_volume(ubi, i);
806 spin_unlock(&ubi->volumes_lock);
807 mutex_unlock(&ubi->vtbl_mutex);
808}
809#endif
diff --git a/drivers/mtd/ubi/vtbl.c b/drivers/mtd/ubi/vtbl.c
new file mode 100644
index 000000000000..b6fd6bbd941e
--- /dev/null
+++ b/drivers/mtd/ubi/vtbl.c
@@ -0,0 +1,809 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 * Copyright (c) Nokia Corporation, 2006, 2007
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * Author: Artem Bityutskiy (Битюцкий Артём)
20 */
21
22/*
23 * This file includes volume table manipulation code. The volume table is an
24 * on-flash table containing volume meta-data like name, number of reserved
25 * physical eraseblocks, type, etc. The volume table is stored in the so-called
26 * "layout volume".
27 *
28 * The layout volume is an internal volume which is organized as follows. It
29 * consists of two logical eraseblocks - LEB 0 and LEB 1. Each logical
30 * eraseblock stores one volume table copy, i.e. LEB 0 and LEB 1 duplicate each
31 * other. This redundancy guarantees robustness to unclean reboots. The volume
32 * table is basically an array of volume table records. Each record contains
33 * full information about the volume and is protected by a CRC checksum.
34 *
35 * When the volume table is changed, it is first changed in RAM. Then LEB 0 is
36 * erased and the updated table is written back to it, and then the same is
37 * done for LEB 1. This scheme guarantees recoverability from unclean reboots.
38 *
39 * In this UBI implementation the on-flash volume table does not contain any
40 * information about how much data static volumes contain. This information may
41 * be found from the scanning data.
42 *
43 * But it would still be beneficial to store this information in the volume
44 * table. For example, suppose we have a static volume X, and all its physical
45 * eraseblocks went bad for some reason. Suppose we are attaching the
46 * corresponding MTD device and the scanning has found no logical eraseblocks
47 * corresponding to volume X. According to the volume table, volume X does
48 * exist, so we do not know whether it is just empty or all its physical
49 * eraseblocks went bad. Hence, we cannot warn the user about this corruption.
50 *
51 * The volume table also stores so-called "update marker", which is used for
52 * volume updates. Before updating the volume, the update marker is set, and
53 * after the update operation is finished, the update marker is cleared. So if
54 * the update operation was interrupted (e.g. by an unclean reboot) - the
55 * update marker is still there and we know that the volume's contents are
56 * damaged.
57 */
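58
59/*
60 * In code terms, the per-record CRC mentioned above is verified like this
61 * (a sketch of what 'vtbl_check()' below does for every record):
62 *
63 *	crc = crc32(UBI_CRC32_INIT, &vtbl[i], UBI_VTBL_RECORD_SIZE_CRC);
64 *	if (ubi32_to_cpu(vtbl[i].crc) != crc)
65 *		... the record is corrupted, recover from the other LEB copy ...
66 */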
58
59#include <linux/crc32.h>
60#include <linux/err.h>
61#include <asm/div64.h>
62#include "ubi.h"
63
64#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
65static void paranoid_vtbl_check(const struct ubi_device *ubi);
66#else
67#define paranoid_vtbl_check(ubi)
68#endif
69
70/* Empty volume table record */
71static struct ubi_vtbl_record empty_vtbl_record;
72
73/**
74 * ubi_change_vtbl_record - change volume table record.
75 * @ubi: UBI device description object
76 * @idx: table index to change
77 * @vtbl_rec: new volume table record
78 *
79 * This function changes volume table record @idx. If @vtbl_rec is %NULL, an
80 * empty volume table record is written. The caller does not have to calculate
81 * the CRC of the record as it is done by this function. Returns zero in case
82 * of success and a negative error code in case of failure.
83 */
84int ubi_change_vtbl_record(struct ubi_device *ubi, int idx,
85 struct ubi_vtbl_record *vtbl_rec)
86{
87 int i, err;
88 uint32_t crc;
89
90 ubi_assert(idx >= 0 && idx < ubi->vtbl_slots);
91
92 if (!vtbl_rec)
93 vtbl_rec = &empty_vtbl_record;
94 else {
95 crc = crc32(UBI_CRC32_INIT, vtbl_rec, UBI_VTBL_RECORD_SIZE_CRC);
96 vtbl_rec->crc = cpu_to_ubi32(crc);
97 }
98
99 dbg_msg("change record %d", idx);
100 ubi_dbg_dump_vtbl_record(vtbl_rec, idx);
101
102 mutex_lock(&ubi->vtbl_mutex);
103 memcpy(&ubi->vtbl[idx], vtbl_rec, sizeof(struct ubi_vtbl_record));
104 for (i = 0; i < UBI_LAYOUT_VOLUME_EBS; i++) {
105 err = ubi_eba_unmap_leb(ubi, UBI_LAYOUT_VOL_ID, i);
106 if (err) {
107 mutex_unlock(&ubi->vtbl_mutex);
108 return err;
109 }
110 err = ubi_eba_write_leb(ubi, UBI_LAYOUT_VOL_ID, i, ubi->vtbl, 0,
111 ubi->vtbl_size, UBI_LONGTERM);
112 if (err) {
113 mutex_unlock(&ubi->vtbl_mutex);
114 return err;
115 }
116 }
117
118 paranoid_vtbl_check(ubi);
119 mutex_unlock(&ubi->vtbl_mutex);
120 return ubi_wl_flush(ubi);
121}
122
123/**
124 * vtbl_check - check if the volume table is not corrupted and contains
125 * sensible data.
126 *
127 * @ubi: UBI device description object
128 * @vtbl: volume table
129 *
130 * This function returns zero if @vtbl is all right, %1 if CRC is incorrect,
131 * and %-EINVAL if it contains inconsistent data.
132 */
133static int vtbl_check(const struct ubi_device *ubi,
134 const struct ubi_vtbl_record *vtbl)
135{
136 int i, n, reserved_pebs, alignment, data_pad, vol_type, name_len;
137 int upd_marker;
138 uint32_t crc;
139 const char *name;
140
141 for (i = 0; i < ubi->vtbl_slots; i++) {
142 cond_resched();
143
144 reserved_pebs = ubi32_to_cpu(vtbl[i].reserved_pebs);
145 alignment = ubi32_to_cpu(vtbl[i].alignment);
146 data_pad = ubi32_to_cpu(vtbl[i].data_pad);
147 upd_marker = vtbl[i].upd_marker;
148 vol_type = vtbl[i].vol_type;
149 name_len = ubi16_to_cpu(vtbl[i].name_len);
150 name = &vtbl[i].name[0];
151
152 crc = crc32(UBI_CRC32_INIT, &vtbl[i], UBI_VTBL_RECORD_SIZE_CRC);
153 if (ubi32_to_cpu(vtbl[i].crc) != crc) {
154 ubi_err("bad CRC at record %u: %#08x, not %#08x",
155 i, crc, ubi32_to_cpu(vtbl[i].crc));
156 ubi_dbg_dump_vtbl_record(&vtbl[i], i);
157 return 1;
158 }
159
160 if (reserved_pebs == 0) {
161 if (memcmp(&vtbl[i], &empty_vtbl_record,
162 UBI_VTBL_RECORD_SIZE)) {
163 dbg_err("bad empty record");
164 goto bad;
165 }
166 continue;
167 }
168
169 if (reserved_pebs < 0 || alignment < 0 || data_pad < 0 ||
170 name_len < 0) {
171 dbg_err("negative values");
172 goto bad;
173 }
174
175 if (alignment > ubi->leb_size || alignment == 0) {
176 dbg_err("bad alignment");
177 goto bad;
178 }
179
180 n = alignment % ubi->min_io_size;
181 if (alignment != 1 && n) {
182 dbg_err("alignment is not multiple of min I/O unit");
183 goto bad;
184 }
185
186 n = ubi->leb_size % alignment;
187 if (data_pad != n) {
188 dbg_err("bad data_pad, has to be %d", n);
189 goto bad;
190 }
191
192 if (vol_type != UBI_VID_DYNAMIC && vol_type != UBI_VID_STATIC) {
193 dbg_err("bad vol_type");
194 goto bad;
195 }
196
197 if (upd_marker != 0 && upd_marker != 1) {
198 dbg_err("bad upd_marker");
199 goto bad;
200 }
201
202 if (reserved_pebs > ubi->good_peb_count) {
203 dbg_err("too large reserved_pebs, good PEBs %d",
204 ubi->good_peb_count);
205 goto bad;
206 }
207
208 if (name_len > UBI_VOL_NAME_MAX) {
209 dbg_err("too long volume name, max %d",
210 UBI_VOL_NAME_MAX);
211 goto bad;
212 }
213
214 if (name[0] == '\0') {
215 dbg_err("NULL volume name");
216 goto bad;
217 }
218
219 if (name_len != strnlen(name, name_len + 1)) {
220 dbg_err("bad name_len");
221 goto bad;
222 }
223 }
224
225 /* Checks that all names are unique */
226 for (i = 0; i < ubi->vtbl_slots - 1; i++) {
227 for (n = i + 1; n < ubi->vtbl_slots; n++) {
228 int len1 = ubi16_to_cpu(vtbl[i].name_len);
229 int len2 = ubi16_to_cpu(vtbl[n].name_len);
230
231 if (len1 > 0 && len1 == len2 &&
232 !strncmp(vtbl[i].name, vtbl[n].name, len1)) {
233 ubi_err("volumes %d and %d have the same name"
234 " \"%s\"", i, n, vtbl[i].name);
235 ubi_dbg_dump_vtbl_record(&vtbl[i], i);
236 ubi_dbg_dump_vtbl_record(&vtbl[n], n);
237 return -EINVAL;
238 }
239 }
240 }
241
242 return 0;
243
244bad:
245 ubi_err("volume table check failed, record %d", i);
246 ubi_dbg_dump_vtbl_record(&vtbl[i], i);
247 return -EINVAL;
248}
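
The per-record CRC check performed in the loop above is worth seeing in
isolation. A minimal sketch of the same check, reusing only the helpers this
file already depends on (crc32(), ubi32_to_cpu(), UBI_CRC32_INIT,
UBI_VTBL_RECORD_SIZE_CRC); the function name record_crc_ok is hypothetical:

	static int record_crc_ok(const struct ubi_vtbl_record *r)
	{
		/* The CRC covers the whole record except the trailing CRC field */
		uint32_t crc = crc32(UBI_CRC32_INIT, r, UBI_VTBL_RECORD_SIZE_CRC);

		return ubi32_to_cpu(r->crc) == crc;
	}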
249
250/**
251 * create_vtbl - create a copy of the volume table.
252 * @ubi: UBI device description object
253 * @si: scanning information
254 * @copy: number of the volume table copy
255 * @vtbl: contents of the volume table
256 *
257 * This function returns zero in case of success and a negative error code in
258 * case of failure.
259 */
260static int create_vtbl(const struct ubi_device *ubi, struct ubi_scan_info *si,
261 int copy, void *vtbl)
262{
263 int err, tries = 0;
264	struct ubi_vid_hdr *vid_hdr;
265 struct ubi_scan_volume *sv;
266 struct ubi_scan_leb *new_seb, *old_seb = NULL;
267
268 ubi_msg("create volume table (copy #%d)", copy + 1);
269
270 vid_hdr = ubi_zalloc_vid_hdr(ubi);
271 if (!vid_hdr)
272 return -ENOMEM;
273
274 /*
275	 * Check if a logical eraseblock which should contain this volume
276	 * table copy was found during scanning. If so, it has to be wiped
277	 * out.
278 */
279 sv = ubi_scan_find_sv(si, UBI_LAYOUT_VOL_ID);
280 if (sv)
281 old_seb = ubi_scan_find_seb(sv, copy);
282
283retry:
284 new_seb = ubi_scan_get_free_peb(ubi, si);
285 if (IS_ERR(new_seb)) {
286 err = PTR_ERR(new_seb);
287 goto out_free;
288 }
289
290 vid_hdr->vol_type = UBI_VID_DYNAMIC;
291 vid_hdr->vol_id = cpu_to_ubi32(UBI_LAYOUT_VOL_ID);
292 vid_hdr->compat = UBI_LAYOUT_VOLUME_COMPAT;
293 vid_hdr->data_size = vid_hdr->used_ebs =
294 vid_hdr->data_pad = cpu_to_ubi32(0);
295 vid_hdr->lnum = cpu_to_ubi32(copy);
296 vid_hdr->sqnum = cpu_to_ubi64(++si->max_sqnum);
297 vid_hdr->leb_ver = cpu_to_ubi32(old_seb ? old_seb->leb_ver + 1: 0);
298
299 /* The EC header is already there, write the VID header */
300 err = ubi_io_write_vid_hdr(ubi, new_seb->pnum, vid_hdr);
301 if (err)
302 goto write_error;
303
304 /* Write the layout volume contents */
305 err = ubi_io_write_data(ubi, vtbl, new_seb->pnum, 0, ubi->vtbl_size);
306 if (err)
307 goto write_error;
308
309 /*
310	 * And add it to the scanning information. Don't free the old
311	 * @old_seb here; it will be deleted and freed in 'ubi_scan_add_used()'.
312 */
313 err = ubi_scan_add_used(ubi, si, new_seb->pnum, new_seb->ec,
314 vid_hdr, 0);
315 kfree(new_seb);
316 ubi_free_vid_hdr(ubi, vid_hdr);
317 return err;
318
319write_error:
320	/* Maybe this physical eraseblock went bad, try to pick another one */
321	if (++tries <= 5)
322		err = ubi_scan_add_to_list(si, new_seb->pnum, new_seb->ec,
323					   &si->corr);
324	kfree(new_seb);
325	if (tries <= 5 && !err)
326		goto retry;
327
328out_free:
329 ubi_free_vid_hdr(ubi, vid_hdr);
330 return err;
331
332}
333
334/**
335 * process_lvol - process the layout volume.
336 * @ubi: UBI device description object
337 * @si: scanning information
338 * @sv: layout volume scanning information
339 *
340 * This function is responsible for reading the layout volume, ensuring it is
341 * not corrupted, and recovering from corruptions if needed. Returns volume
342 * table in case of success and a negative error code in case of failure.
343 */
344static struct ubi_vtbl_record *process_lvol(const struct ubi_device *ubi,
345 struct ubi_scan_info *si,
346 struct ubi_scan_volume *sv)
347{
348 int err;
349 struct rb_node *rb;
350 struct ubi_scan_leb *seb;
351 struct ubi_vtbl_record *leb[UBI_LAYOUT_VOLUME_EBS] = { NULL, NULL };
352 int leb_corrupted[UBI_LAYOUT_VOLUME_EBS] = {1, 1};
353
354 /*
355 * UBI goes through the following steps when it changes the layout
356 * volume:
357 * a. erase LEB 0;
358 * b. write new data to LEB 0;
359 * c. erase LEB 1;
360 * d. write new data to LEB 1.
361 *
362 * Before the change, both LEBs contain the same data.
363 *
364	 * Due to unclean reboots, the contents of LEB 0 may be lost, but there
365	 * should still be LEB 1. So it is OK if LEB 0 is corrupted while LEB 1
366	 * is not. Similarly, LEB 1 may be lost, but there should be LEB 0. And
367 * finally, unclean reboots may result in a situation when neither LEB
368 * 0 nor LEB 1 are corrupted, but they are different. In this case, LEB
369 * 0 contains more recent information.
370 *
371 * So the plan is to first check LEB 0. Then
372	 * a. if LEB 0 is OK, it must contain the most recent data; then
373	 *    we compare it with LEB 1, and if they are different, we copy LEB
374	 *    0 to LEB 1;
375	 * b. if LEB 0 is corrupted, LEB 1 has to be OK; in that case we copy
376	 *    LEB 1 to LEB 0.
377 */
378
379 dbg_msg("check layout volume");
380
381 /* Read both LEB 0 and LEB 1 into memory */
382 ubi_rb_for_each_entry(rb, seb, &sv->root, u.rb) {
383 leb[seb->lnum] = kzalloc(ubi->vtbl_size, GFP_KERNEL);
384 if (!leb[seb->lnum]) {
385 err = -ENOMEM;
386 goto out_free;
387 }
388
389 err = ubi_io_read_data(ubi, leb[seb->lnum], seb->pnum, 0,
390 ubi->vtbl_size);
391 if (err == UBI_IO_BITFLIPS || err == -EBADMSG)
392 /* Scrub the PEB later */
393 seb->scrub = 1;
394 else if (err)
395 goto out_free;
396 }
397
398 err = -EINVAL;
399 if (leb[0]) {
400 leb_corrupted[0] = vtbl_check(ubi, leb[0]);
401 if (leb_corrupted[0] < 0)
402 goto out_free;
403 }
404
405 if (!leb_corrupted[0]) {
406 /* LEB 0 is OK */
407 if (leb[1])
408 leb_corrupted[1] = memcmp(leb[0], leb[1], ubi->vtbl_size);
409 if (leb_corrupted[1]) {
410 ubi_warn("volume table copy #2 is corrupted");
411 err = create_vtbl(ubi, si, 1, leb[0]);
412 if (err)
413 goto out_free;
414 ubi_msg("volume table was restored");
415 }
416
417	/* Both LEB 0 and LEB 1 are OK and consistent */
418 kfree(leb[1]);
419 return leb[0];
420 } else {
421 /* LEB 0 is corrupted or does not exist */
422 if (leb[1]) {
423 leb_corrupted[1] = vtbl_check(ubi, leb[1]);
424 if (leb_corrupted[1] < 0)
425 goto out_free;
426 }
427 if (leb_corrupted[1]) {
428 /* Both LEB 0 and LEB 1 are corrupted */
429 ubi_err("both volume tables are corrupted");
430 goto out_free;
431 }
432
433 ubi_warn("volume table copy #1 is corrupted");
434 err = create_vtbl(ubi, si, 0, leb[1]);
435 if (err)
436 goto out_free;
437 ubi_msg("volume table was restored");
438
439 kfree(leb[0]);
440 return leb[1];
441 }
442
443out_free:
444 kfree(leb[0]);
445 kfree(leb[1]);
446 return ERR_PTR(err);
447}
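
To summarize, the recovery logic above implements the following decision
table (a sketch, using the vtbl_check() convention: 0 means OK, 1 means bad
CRC, negative means inconsistent data):

	LEB 0 OK,  LEB 1 identical  -> use LEB 0
	LEB 0 OK,  LEB 1 differs    -> rewrite copy #2 from LEB 0, use LEB 0
	LEB 0 bad, LEB 1 OK         -> rewrite copy #1 from LEB 1, use LEB 1
	LEB 0 bad, LEB 1 bad        -> fail, the volume table is lost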
448
449/**
450 * create_empty_lvol - create an empty layout volume.
451 * @ubi: UBI device description object
452 * @si: scanning information
453 *
454 * This function returns volume table contents in case of success and a
455 * negative error code in case of failure.
456 */
457static struct ubi_vtbl_record *create_empty_lvol(const struct ubi_device *ubi,
458 struct ubi_scan_info *si)
459{
460 int i;
461 struct ubi_vtbl_record *vtbl;
462
463 vtbl = kzalloc(ubi->vtbl_size, GFP_KERNEL);
464 if (!vtbl)
465 return ERR_PTR(-ENOMEM);
466
467 for (i = 0; i < ubi->vtbl_slots; i++)
468 memcpy(&vtbl[i], &empty_vtbl_record, UBI_VTBL_RECORD_SIZE);
469
470 for (i = 0; i < UBI_LAYOUT_VOLUME_EBS; i++) {
471 int err;
472
473 err = create_vtbl(ubi, si, i, vtbl);
474 if (err) {
475 kfree(vtbl);
476 return ERR_PTR(err);
477 }
478 }
479
480 return vtbl;
481}
482
483/**
484 * init_volumes - initialize volume information for existing volumes.
485 * @ubi: UBI device description object
486 * @si: scanning information
487 * @vtbl: volume table
488 *
489 * This function allocates volume description objects for existing volumes.
490 * Returns zero in case of success and a negative error code in case of
491 * failure.
492 */
493static int init_volumes(struct ubi_device *ubi, const struct ubi_scan_info *si,
494 const struct ubi_vtbl_record *vtbl)
495{
496 int i, reserved_pebs = 0;
497 struct ubi_scan_volume *sv;
498 struct ubi_volume *vol;
499
500 for (i = 0; i < ubi->vtbl_slots; i++) {
501 cond_resched();
502
503 if (ubi32_to_cpu(vtbl[i].reserved_pebs) == 0)
504 continue; /* Empty record */
505
506 vol = kzalloc(sizeof(struct ubi_volume), GFP_KERNEL);
507 if (!vol)
508 return -ENOMEM;
509
510 vol->reserved_pebs = ubi32_to_cpu(vtbl[i].reserved_pebs);
511 vol->alignment = ubi32_to_cpu(vtbl[i].alignment);
512 vol->data_pad = ubi32_to_cpu(vtbl[i].data_pad);
513 vol->vol_type = vtbl[i].vol_type == UBI_VID_DYNAMIC ?
514 UBI_DYNAMIC_VOLUME : UBI_STATIC_VOLUME;
515 vol->name_len = ubi16_to_cpu(vtbl[i].name_len);
516 vol->usable_leb_size = ubi->leb_size - vol->data_pad;
517 memcpy(vol->name, vtbl[i].name, vol->name_len);
518 vol->name[vol->name_len] = '\0';
519 vol->vol_id = i;
520
521 ubi_assert(!ubi->volumes[i]);
522 ubi->volumes[i] = vol;
523 ubi->vol_count += 1;
524 vol->ubi = ubi;
525 reserved_pebs += vol->reserved_pebs;
526
527 /*
528		 * In case of a dynamic volume UBI knows nothing about how much
529		 * data is stored there. So assume the whole volume is used.
530 */
531 if (vol->vol_type == UBI_DYNAMIC_VOLUME) {
532 vol->used_ebs = vol->reserved_pebs;
533 vol->last_eb_bytes = vol->usable_leb_size;
534 vol->used_bytes = vol->used_ebs * vol->usable_leb_size;
535 continue;
536 }
537
538 /* Static volumes only */
539 sv = ubi_scan_find_sv(si, i);
540 if (!sv) {
541 /*
542 * No eraseblocks belonging to this volume found. We
543 * don't actually know whether this static volume is
544 * completely corrupted or just contains no data. And
545 * we cannot know this as long as data size is not
546 * stored on flash. So we just assume the volume is
547 * empty. FIXME: this should be handled.
548 */
549 continue;
550 }
551
552 if (sv->leb_count != sv->used_ebs) {
553 /*
554 * We found a static volume which misses several
555 * eraseblocks. Treat it as corrupted.
556 */
557 ubi_warn("static volume %d misses %d LEBs - corrupted",
558 sv->vol_id, sv->used_ebs - sv->leb_count);
559 vol->corrupted = 1;
560 continue;
561 }
562
563 vol->used_ebs = sv->used_ebs;
564 vol->used_bytes = (vol->used_ebs - 1) * vol->usable_leb_size;
565 vol->used_bytes += sv->last_data_size;
566 vol->last_eb_bytes = sv->last_data_size;
567 }
568
569 vol = kzalloc(sizeof(struct ubi_volume), GFP_KERNEL);
570 if (!vol)
571 return -ENOMEM;
572
573 vol->reserved_pebs = UBI_LAYOUT_VOLUME_EBS;
574 vol->alignment = 1;
575 vol->vol_type = UBI_DYNAMIC_VOLUME;
576 vol->name_len = sizeof(UBI_LAYOUT_VOLUME_NAME) - 1;
577 memcpy(vol->name, UBI_LAYOUT_VOLUME_NAME, vol->name_len + 1);
578 vol->usable_leb_size = ubi->leb_size;
579 vol->used_ebs = vol->reserved_pebs;
580 vol->last_eb_bytes = vol->reserved_pebs;
581 vol->used_bytes = vol->used_ebs * (ubi->leb_size - vol->data_pad);
582 vol->vol_id = UBI_LAYOUT_VOL_ID;
583
584 ubi_assert(!ubi->volumes[i]);
585 ubi->volumes[vol_id2idx(ubi, vol->vol_id)] = vol;
586 reserved_pebs += vol->reserved_pebs;
587 ubi->vol_count += 1;
588 vol->ubi = ubi;
589
590 if (reserved_pebs > ubi->avail_pebs)
591 ubi_err("not enough PEBs, required %d, available %d",
592 reserved_pebs, ubi->avail_pebs);
593 ubi->rsvd_pebs += reserved_pebs;
594 ubi->avail_pebs -= reserved_pebs;
595
596 return 0;
597}
598
599/**
600 * check_sv - check volume scanning information.
601 * @vol: UBI volume description object
602 * @sv: volume scanning information
603 *
604 * This function returns zero if the volume scanning information is consistent
605 * with the data read from the volume table, and %-EINVAL if not.
606 */
607static int check_sv(const struct ubi_volume *vol,
608 const struct ubi_scan_volume *sv)
609{
610 if (sv->highest_lnum >= vol->reserved_pebs) {
611 dbg_err("bad highest_lnum");
612 goto bad;
613 }
614 if (sv->leb_count > vol->reserved_pebs) {
615 dbg_err("bad leb_count");
616 goto bad;
617 }
618 if (sv->vol_type != vol->vol_type) {
619 dbg_err("bad vol_type");
620 goto bad;
621 }
622 if (sv->used_ebs > vol->reserved_pebs) {
623 dbg_err("bad used_ebs");
624 goto bad;
625 }
626 if (sv->data_pad != vol->data_pad) {
627 dbg_err("bad data_pad");
628 goto bad;
629 }
630 return 0;
631
632bad:
633 ubi_err("bad scanning information");
634 ubi_dbg_dump_sv(sv);
635 ubi_dbg_dump_vol_info(vol);
636 return -EINVAL;
637}
638
639/**
640 * check_scanning_info - check the scanning information.
641 * @ubi: UBI device description object
642 * @si: scanning information
643 *
644 * Even though we protect on-flash data by CRC checksums, we still don't trust
645 * the media. This function ensures that scanning information is consistent
646 * with the information read from the volume table. Returns zero if the scanning
647 * information is OK and %-EINVAL if it is not.
648 */
649static int check_scanning_info(const struct ubi_device *ubi,
650 struct ubi_scan_info *si)
651{
652 int err, i;
653 struct ubi_scan_volume *sv;
654 struct ubi_volume *vol;
655
656 if (si->vols_found > UBI_INT_VOL_COUNT + ubi->vtbl_slots) {
657 ubi_err("scanning found %d volumes, maximum is %d + %d",
658 si->vols_found, UBI_INT_VOL_COUNT, ubi->vtbl_slots);
659 return -EINVAL;
660 }
661
662	if (si->highest_vol_id >= ubi->vtbl_slots + UBI_INT_VOL_COUNT &&
663 si->highest_vol_id < UBI_INTERNAL_VOL_START) {
664 ubi_err("too large volume ID %d found by scanning",
665 si->highest_vol_id);
666 return -EINVAL;
667 }
668
669
670 for (i = 0; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++) {
671 cond_resched();
672
673 sv = ubi_scan_find_sv(si, i);
674 vol = ubi->volumes[i];
675 if (!vol) {
676 if (sv)
677 ubi_scan_rm_volume(si, sv);
678 continue;
679 }
680
681 if (vol->reserved_pebs == 0) {
682 ubi_assert(i < ubi->vtbl_slots);
683
684 if (!sv)
685 continue;
686
687 /*
688 * During scanning we found a volume which does not
689 * exist according to the information in the volume
690 * table. This must have happened due to an unclean
691 * reboot while the volume was being removed. Discard
692 * these eraseblocks.
693 */
694 ubi_msg("finish volume %d removal", sv->vol_id);
695 ubi_scan_rm_volume(si, sv);
696 } else if (sv) {
697 err = check_sv(vol, sv);
698 if (err)
699 return err;
700 }
701 }
702
703 return 0;
704}
705
706/**
707 * ubi_read_volume_table - read the volume table.
708 *
709 * @ubi: UBI device description object
710 * @si: scanning information
711 *
712 * This function reads the volume table, checks it, recovers from errors if
713 * needed, or creates it if needed. Returns zero in case of success and a
714 * negative error code in case of failure.
715 */
716int ubi_read_volume_table(struct ubi_device *ubi, struct ubi_scan_info *si)
717{
718 int i, err;
719 struct ubi_scan_volume *sv;
720
721 empty_vtbl_record.crc = cpu_to_ubi32(0xf116c36b);
722
723 /*
724 * The number of supported volumes is limited by the eraseblock size
725 * and by the UBI_MAX_VOLUMES constant.
726 */
727 ubi->vtbl_slots = ubi->leb_size / UBI_VTBL_RECORD_SIZE;
728 if (ubi->vtbl_slots > UBI_MAX_VOLUMES)
729 ubi->vtbl_slots = UBI_MAX_VOLUMES;
730
731 ubi->vtbl_size = ubi->vtbl_slots * UBI_VTBL_RECORD_SIZE;
732 ubi->vtbl_size = ALIGN(ubi->vtbl_size, ubi->min_io_size);
733
734 sv = ubi_scan_find_sv(si, UBI_LAYOUT_VOL_ID);
735 if (!sv) {
736 /*
737 * No logical eraseblocks belonging to the layout volume were
738		 * found. This could mean that the flash is just empty. In
739		 * this case we create an empty layout volume.
740		 *
741		 * But if the flash is not empty, this must be a corruption or the
742 * MTD device just contains garbage.
743 */
744 if (si->is_empty) {
745 ubi->vtbl = create_empty_lvol(ubi, si);
746 if (IS_ERR(ubi->vtbl))
747 return PTR_ERR(ubi->vtbl);
748 } else {
749 ubi_err("the layout volume was not found");
750 return -EINVAL;
751 }
752 } else {
753 if (sv->leb_count > UBI_LAYOUT_VOLUME_EBS) {
754 /* This must not happen with proper UBI images */
755 dbg_err("too many LEBs (%d) in layout volume",
756 sv->leb_count);
757 return -EINVAL;
758 }
759
760 ubi->vtbl = process_lvol(ubi, si, sv);
761 if (IS_ERR(ubi->vtbl))
762 return PTR_ERR(ubi->vtbl);
763 }
764
765 ubi->avail_pebs = ubi->good_peb_count;
766
767 /*
768 * The layout volume is OK, initialize the corresponding in-RAM data
769 * structures.
770 */
771 err = init_volumes(ubi, si, ubi->vtbl);
772 if (err)
773 goto out_free;
774
775 /*
776	 * Make sure that the scanning information is consistent with the
777 * information stored in the volume table.
778 */
779 err = check_scanning_info(ubi, si);
780 if (err)
781 goto out_free;
782
783 return 0;
784
785out_free:
786 kfree(ubi->vtbl);
787 for (i = 0; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++)
788 if (ubi->volumes[i]) {
789 kfree(ubi->volumes[i]);
790 ubi->volumes[i] = NULL;
791 }
792 return err;
793}
794
795#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
796
797/**
798 * paranoid_vtbl_check - check volume table.
799 * @ubi: UBI device description object
800 */
801static void paranoid_vtbl_check(const struct ubi_device *ubi)
802{
803 if (vtbl_check(ubi, ubi->vtbl)) {
804 ubi_err("paranoid check failed");
805 BUG();
806 }
807}
808
809#endif /* CONFIG_MTD_UBI_DEBUG_PARANOID */
diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c
new file mode 100644
index 000000000000..9ecaf77eca9e
--- /dev/null
+++ b/drivers/mtd/ubi/wl.c
@@ -0,0 +1,1671 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Authors: Artem Bityutskiy (Битюцкий Артём), Thomas Gleixner
19 */
20
21/*
22 * UBI wear-leveling unit.
23 *
24 * This unit is responsible for wear-leveling. It works in terms of physical
25 * eraseblocks and erase counters and knows nothing about logical eraseblocks,
26 * volumes, etc. From this unit's perspective all physical eraseblocks are of
27 * two types - used and free. Used physical eraseblocks are those that were
28 * "get" by the 'ubi_wl_get_peb()' function, and free physical eraseblocks are
29 * those that were put by the 'ubi_wl_put_peb()' function.
30 *
31 * Physical eraseblocks returned by 'ubi_wl_get_peb()' have only erase counter
32 * header. The rest of the physical eraseblock contains only 0xFF bytes.
33 *
34 * When physical eraseblocks are returned to the WL unit by means of the
35 * 'ubi_wl_put_peb()' function, they are scheduled for erasure. The erasure is
36 * done asynchronously in context of the per-UBI device background thread,
37 * which is also managed by the WL unit.
38 *
39 * The wear-leveling is ensured by means of moving the contents of used
40 * physical eraseblocks with low erase counter to free physical eraseblocks
41 * with high erase counter.
42 *
43 * The 'ubi_wl_get_peb()' function accepts data type hints which help to pick
44 * an "optimal" physical eraseblock. For example, when it is known that the
45 * physical eraseblock will be "put" soon because it contains short-term data,
46 * the WL unit may pick a free physical eraseblock with low erase counter, and
47 * so forth.
48 *
49 * If the WL unit fails to erase a physical eraseblock, it marks it as bad.
50 *
51 * This unit is also responsible for scrubbing. If a bit-flip is detected in a
52 * physical eraseblock, it has to be moved. Technically this is the same as
53 * moving it for wear-leveling reasons.
54 *
55 * As was said, for the WL unit all physical eraseblocks are either "free"
56 * or "used". Free eraseblocks are kept in the @wl->free RB-tree, while used
57 * eraseblocks are kept in a set of different RB-trees: @wl->used,
58 * @wl->prot.pnum, @wl->prot.aec, and @wl->scrub.
59 *
60 * Note, in this implementation, we keep a small in-RAM object for each physical
61 * eraseblock. This is surely not a scalable solution. But it appears to be good
62 * enough for moderately large flashes and it is simple. In future, one may
63 * re-work this unit and make it more scalable.
64 *
65 * At the moment this unit does not utilize the sequence number, which was
66 * introduced relatively recently. But it would be wise to do this because the
67 * sequence number of a logical eraseblock characterizes how old it is. For
68 * example, when we move a PEB with low erase counter, and we need to pick the
69 * target PEB, we pick a PEB with the highest EC if our PEB is "old" and we
70 * pick a target PEB with an average EC if our PEB is not very "old". There is
71 * room for future re-work of the WL unit here.
72 *
73 * FIXME: looks too complex, should be simplified (later).
74 */
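
As a reading aid, the lifetime of a physical eraseblock through the trees
described above can be sketched as follows (the function names are the ones
defined later in this file):

	free --ubi_wl_get_peb()--> prot.pnum/prot.aec
	     --check_protection_over()--> used (or scrub, after bit-flips)
	     --ubi_wl_put_peb()--> erase queue --erase_worker()--> free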
75
76#include <linux/slab.h>
77#include <linux/crc32.h>
78#include <linux/freezer.h>
79#include <linux/kthread.h>
80#include "ubi.h"
81
82/* Number of physical eraseblocks reserved for wear-leveling purposes */
83#define WL_RESERVED_PEBS 1
84
85/*
86 * For how many erase cycles short term, unknown, and long term physical
87 * eraseblocks are protected.
88 */
89#define ST_PROTECTION 16
90#define U_PROTECTION 10
91#define LT_PROTECTION 4
92
93/*
94 * Maximum difference between two erase counters. If this threshold is
95 * exceeded, the WL unit starts moving data from used physical eraseblocks with
96 * low erase counter to free physical eraseblocks with high erase counter.
97 */
98#define UBI_WL_THRESHOLD CONFIG_MTD_UBI_WL_THRESHOLD
99
100/*
101 * When a physical eraseblock is moved, the WL unit has to pick the target
102 * physical eraseblock to move to. The simplest way would be just to pick the
103 * one with the highest erase counter. But in certain workloads this could lead
104 * to unlimited wear of one or a few physical eraseblocks. Indeed, imagine a
105 * situation when the picked physical eraseblock is constantly erased after the
106 * data is written to it. So, we have a constant which limits the highest erase
107 * counter of the free physical eraseblock to pick. Namely, the WL unit does
108 * not pick eraseblocks with an erase counter greater than the lowest erase
109 * counter plus %WL_FREE_MAX_DIFF.
110 */
111#define WL_FREE_MAX_DIFF (2*UBI_WL_THRESHOLD)
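
A quick worked example, assuming the Kconfig default of
CONFIG_MTD_UBI_WL_THRESHOLD = 4096: if the least worn free physical
eraseblock has erase counter 1000, the WL unit will not hand out a free
physical eraseblock with an erase counter of 1000 + 2*4096 = 9192 or more,
no matter how worn the rest of the free pool is.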
112
113/*
114 * Maximum number of consecutive background thread failures which is enough to
115 * switch to read-only mode.
116 */
117#define WL_MAX_FAILURES 32
118
119/**
120 * struct ubi_wl_entry - wear-leveling entry.
121 * @rb: link in the corresponding RB-tree
122 * @ec: erase counter
123 * @pnum: physical eraseblock number
124 *
125 * Each physical eraseblock has a corresponding &struct wl_entry object which
126 * may be kept in different RB-trees.
127 */
128struct ubi_wl_entry {
129 struct rb_node rb;
130 int ec;
131 int pnum;
132};
133
134/**
135 * struct ubi_wl_prot_entry - PEB protection entry.
136 * @rb_pnum: link in the @wl->prot.pnum RB-tree
137 * @rb_aec: link in the @wl->prot.aec RB-tree
138 * @abs_ec: the absolute erase counter value when the protection ends
139 * @e: the wear-leveling entry of the physical eraseblock under protection
140 *
141 * When the WL unit returns a physical eraseblock, the physical eraseblock is
142 * protected from being moved for some "time". For this reason, the physical
143 * eraseblock is not directly moved from the @wl->free tree to the @wl->used
144 * tree. There is one more tree in between where this physical eraseblock is
145 * temporarily stored (@wl->prot).
146 *
147 * All this protection stuff is needed because:
148 * o we don't want to move physical eraseblocks just after we have given them
149 * to the user; instead, we first want to let users fill them up with data;
150 *
151 * o there is a chance that the user will put the physical eraseblock very
152 * soon, so it makes sense not to move it for some time, but wait; this is
153 * especially important in case of "short term" physical eraseblocks.
154 *
155 * Physical eraseblocks stay protected only for a limited time. But the
156 * "time" is measured in erase cycles in this case. This is implemented with
157 * the help of the absolute erase counter (@wl->abs_ec). When it reaches a
158 * certain value, the physical eraseblocks are moved from the protection
159 * trees (@wl->prot.*) to the @wl->used tree.
160 *
161 * Protected physical eraseblocks are searched by physical eraseblock number
162 * (when they are put) and by the absolute erase counter (to check if it is
163 * time to move them to the @wl->used tree). So there are actually 2 RB-trees
164 * storing the protected physical eraseblocks: @wl->prot.pnum and
165 * @wl->prot.aec. They are referred to as the "protection" trees. The
166 * first one is indexed by the physical eraseblock number. The second one is
167 * indexed by the absolute erase counter. Both trees store
168 * &struct ubi_wl_prot_entry objects.
169 *
170 * Each physical eraseblock has 2 main states: free and used. The former state
171 * corresponds to the @wl->free tree. The latter state is split up into several
172 * sub-states:
173 * o the WL movement is allowed (@wl->used tree);
174 * o the WL movement is temporarily prohibited (@wl->prot.pnum and
175 * @wl->prot.aec trees);
176 * o scrubbing is needed (@wl->scrub tree).
177 *
178 * Depending on the sub-state, wear-leveling entries of the used physical
179 * eraseblocks may be kept in one of those trees.
180 */
181struct ubi_wl_prot_entry {
182 struct rb_node rb_pnum;
183 struct rb_node rb_aec;
184 unsigned long long abs_ec;
185 struct ubi_wl_entry *e;
186};
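
A small numeric walk-through of the protection scheme (the numbers are
hypothetical): if a short-term physical eraseblock is handed out while
@ubi->abs_ec is 100, prot_tree_add() records pe->abs_ec = 100 +
ST_PROTECTION = 116 in the entry; once 16 more erase operations have
completed on the device, check_protection_over() sees pe->abs_ec <=
@ubi->abs_ec and moves the entry to the @wl->used tree.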
187
188/**
189 * struct ubi_work - UBI work description data structure.
190 * @list: a link in the list of pending works
191 * @func: worker function
192 * @priv: private data of the worker function
193 *
194 * @e: physical eraseblock to erase
195 * @torture: if the physical eraseblock has to be tortured
196 *
197 * The @func pointer points to the worker function. If the @cancel argument is
198 * not zero, the worker has to free the resources and exit immediately. The
199 * worker has to return zero in case of success and a negative error code in
200 * case of failure.
201 */
202struct ubi_work {
203 struct list_head list;
204 int (*func)(struct ubi_device *ubi, struct ubi_work *wrk, int cancel);
205 /* The below fields are only relevant to erasure works */
206 struct ubi_wl_entry *e;
207 int torture;
208};
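
The worker contract described above is easiest to see as a stub. A minimal
hedged sketch (example_worker is hypothetical; the real workers are
erase_worker() and wear_leveling_worker() below):

	static int example_worker(struct ubi_device *ubi, struct ubi_work *wrk,
				  int cancel)
	{
		/* The worker owns @wrk and must free it on every path */
		kfree(wrk);
		if (cancel)
			return 0;	/* free resources and exit immediately */

		/* ... do the actual work here ... */
		return 0;		/* zero or a negative error code */
	}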
209
210#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
211static int paranoid_check_ec(const struct ubi_device *ubi, int pnum, int ec);
212static int paranoid_check_in_wl_tree(struct ubi_wl_entry *e,
213 struct rb_root *root);
214#else
215#define paranoid_check_ec(ubi, pnum, ec) 0
216#define paranoid_check_in_wl_tree(e, root)
217#endif
218
219/* Slab cache for wear-leveling entries */
220static struct kmem_cache *wl_entries_slab;
221
222/**
223 * tree_empty - a helper function to check if an RB-tree is empty.
224 * @root: the root of the tree
225 *
226 * This function returns non-zero if the RB-tree is empty and zero if not.
227 */
228static inline int tree_empty(struct rb_root *root)
229{
230 return root->rb_node == NULL;
231}
232
233/**
234 * wl_tree_add - add a wear-leveling entry to a WL RB-tree.
235 * @e: the wear-leveling entry to add
236 * @root: the root of the tree
237 *
238 * Note, we use (erase counter, physical eraseblock number) pairs as keys in
239 * the @ubi->used and @ubi->free RB-trees.
240 */
241static void wl_tree_add(struct ubi_wl_entry *e, struct rb_root *root)
242{
243 struct rb_node **p, *parent = NULL;
244
245 p = &root->rb_node;
246 while (*p) {
247 struct ubi_wl_entry *e1;
248
249 parent = *p;
250 e1 = rb_entry(parent, struct ubi_wl_entry, rb);
251
252 if (e->ec < e1->ec)
253 p = &(*p)->rb_left;
254 else if (e->ec > e1->ec)
255 p = &(*p)->rb_right;
256 else {
257 ubi_assert(e->pnum != e1->pnum);
258 if (e->pnum < e1->pnum)
259 p = &(*p)->rb_left;
260 else
261 p = &(*p)->rb_right;
262 }
263 }
264
265 rb_link_node(&e->rb, parent, p);
266 rb_insert_color(&e->rb, root);
267}
268
269
270/*
271 * Helper functions to add and delete wear-leveling entries from different
272 * trees.
273 */
274
275static void free_tree_add(struct ubi_device *ubi, struct ubi_wl_entry *e)
276{
277 wl_tree_add(e, &ubi->free);
278}
279static inline void used_tree_add(struct ubi_device *ubi,
280 struct ubi_wl_entry *e)
281{
282 wl_tree_add(e, &ubi->used);
283}
284static inline void scrub_tree_add(struct ubi_device *ubi,
285 struct ubi_wl_entry *e)
286{
287 wl_tree_add(e, &ubi->scrub);
288}
289static inline void free_tree_del(struct ubi_device *ubi,
290 struct ubi_wl_entry *e)
291{
292 paranoid_check_in_wl_tree(e, &ubi->free);
293 rb_erase(&e->rb, &ubi->free);
294}
295static inline void used_tree_del(struct ubi_device *ubi,
296 struct ubi_wl_entry *e)
297{
298 paranoid_check_in_wl_tree(e, &ubi->used);
299 rb_erase(&e->rb, &ubi->used);
300}
301static inline void scrub_tree_del(struct ubi_device *ubi,
302 struct ubi_wl_entry *e)
303{
304 paranoid_check_in_wl_tree(e, &ubi->scrub);
305 rb_erase(&e->rb, &ubi->scrub);
306}
307
308/**
309 * do_work - do one pending work.
310 * @ubi: UBI device description object
311 *
312 * This function returns zero in case of success and a negative error code in
313 * case of failure.
314 */
315static int do_work(struct ubi_device *ubi)
316{
317 int err;
318 struct ubi_work *wrk;
319
320 spin_lock(&ubi->wl_lock);
321
322 if (list_empty(&ubi->works)) {
323 spin_unlock(&ubi->wl_lock);
324 return 0;
325 }
326
327 wrk = list_entry(ubi->works.next, struct ubi_work, list);
328 list_del(&wrk->list);
329 spin_unlock(&ubi->wl_lock);
330
331 /*
332 * Call the worker function. Do not touch the work structure
333 * after this call as it will have been freed or reused by that
334 * time by the worker function.
335 */
336 err = wrk->func(ubi, wrk, 0);
337 if (err)
338 ubi_err("work failed with error code %d", err);
339
340 spin_lock(&ubi->wl_lock);
341 ubi->works_count -= 1;
342 ubi_assert(ubi->works_count >= 0);
343 spin_unlock(&ubi->wl_lock);
344 return err;
345}
346
347/**
348 * produce_free_peb - produce a free physical eraseblock.
349 * @ubi: UBI device description object
350 *
351 * This function tries to make a free PEB by means of synchronous execution of
352 * pending works. This may be needed if, for example, the background thread is
353 * disabled. Returns zero in case of success and a negative error code in case
354 * of failure.
355 */
356static int produce_free_peb(struct ubi_device *ubi)
357{
358 int err;
359
360 spin_lock(&ubi->wl_lock);
361 while (tree_empty(&ubi->free)) {
362 spin_unlock(&ubi->wl_lock);
363
364 dbg_wl("do one work synchronously");
365 err = do_work(ubi);
366 if (err)
367 return err;
368
369 spin_lock(&ubi->wl_lock);
370 }
371 spin_unlock(&ubi->wl_lock);
372
373 return 0;
374}
375
376/**
377 * in_wl_tree - check if wear-leveling entry is present in a WL RB-tree.
378 * @e: the wear-leveling entry to check
379 * @root: the root of the tree
380 *
381 * This function returns non-zero if @e is in the @root RB-tree and zero if it
382 * is not.
383 */
384static int in_wl_tree(struct ubi_wl_entry *e, struct rb_root *root)
385{
386 struct rb_node *p;
387
388 p = root->rb_node;
389 while (p) {
390 struct ubi_wl_entry *e1;
391
392 e1 = rb_entry(p, struct ubi_wl_entry, rb);
393
394 if (e->pnum == e1->pnum) {
395 ubi_assert(e == e1);
396 return 1;
397 }
398
399 if (e->ec < e1->ec)
400 p = p->rb_left;
401 else if (e->ec > e1->ec)
402 p = p->rb_right;
403 else {
404 ubi_assert(e->pnum != e1->pnum);
405 if (e->pnum < e1->pnum)
406 p = p->rb_left;
407 else
408 p = p->rb_right;
409 }
410 }
411
412 return 0;
413}
414
415/**
416 * prot_tree_add - add physical eraseblock to protection trees.
417 * @ubi: UBI device description object
418 * @e: the physical eraseblock to add
419 * @pe: protection entry object to use
420 * @abs_ec: absolute erase counter value when this physical eraseblock has
421 * to be removed from the protection trees.
422 *
423 * @wl->lock has to be locked.
424 */
425static void prot_tree_add(struct ubi_device *ubi, struct ubi_wl_entry *e,
426 struct ubi_wl_prot_entry *pe, int abs_ec)
427{
428 struct rb_node **p, *parent = NULL;
429 struct ubi_wl_prot_entry *pe1;
430
431 pe->e = e;
432 pe->abs_ec = ubi->abs_ec + abs_ec;
433
434 p = &ubi->prot.pnum.rb_node;
435 while (*p) {
436 parent = *p;
437 pe1 = rb_entry(parent, struct ubi_wl_prot_entry, rb_pnum);
438
439 if (e->pnum < pe1->e->pnum)
440 p = &(*p)->rb_left;
441 else
442 p = &(*p)->rb_right;
443 }
444 rb_link_node(&pe->rb_pnum, parent, p);
445 rb_insert_color(&pe->rb_pnum, &ubi->prot.pnum);
446
447 p = &ubi->prot.aec.rb_node;
448 parent = NULL;
449 while (*p) {
450 parent = *p;
451 pe1 = rb_entry(parent, struct ubi_wl_prot_entry, rb_aec);
452
453 if (pe->abs_ec < pe1->abs_ec)
454 p = &(*p)->rb_left;
455 else
456 p = &(*p)->rb_right;
457 }
458 rb_link_node(&pe->rb_aec, parent, p);
459 rb_insert_color(&pe->rb_aec, &ubi->prot.aec);
460}
461
462/**
463 * find_wl_entry - find wear-leveling entry closest to certain erase counter.
464 * @root: the RB-tree where to look for
465 * @max: highest possible erase counter
466 *
467 * This function looks for a wear leveling entry with erase counter closest to
468 * @max and less than @max.
469 */
470static struct ubi_wl_entry *find_wl_entry(struct rb_root *root, int max)
471{
472 struct rb_node *p;
473 struct ubi_wl_entry *e;
474
475 e = rb_entry(rb_first(root), struct ubi_wl_entry, rb);
476 max += e->ec;
477
478 p = root->rb_node;
479 while (p) {
480 struct ubi_wl_entry *e1;
481
482 e1 = rb_entry(p, struct ubi_wl_entry, rb);
483 if (e1->ec >= max)
484 p = p->rb_left;
485 else {
486 p = p->rb_right;
487 e = e1;
488 }
489 }
490
491 return e;
492}
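
A worked example of the lookup above: if the free tree holds entries with
erase counters {1000, 1005, 9500} and @max is WL_FREE_MAX_DIFF (8192 with
the assumed default threshold of 4096), the bound becomes 1000 + 8192 =
9192, so the function returns the entry with erase counter 1005, the
largest one below the bound.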
493
494/**
495 * ubi_wl_get_peb - get a physical eraseblock.
496 * @ubi: UBI device description object
497 * @dtype: type of data which will be stored in this physical eraseblock
498 *
499 * This function returns a physical eraseblock in case of success and a
500 * negative error code in case of failure. Might sleep.
501 */
502int ubi_wl_get_peb(struct ubi_device *ubi, int dtype)
503{
504 int err, protect, medium_ec;
505 struct ubi_wl_entry *e, *first, *last;
506 struct ubi_wl_prot_entry *pe;
507
508 ubi_assert(dtype == UBI_LONGTERM || dtype == UBI_SHORTTERM ||
509 dtype == UBI_UNKNOWN);
510
511 pe = kmalloc(sizeof(struct ubi_wl_prot_entry), GFP_KERNEL);
512 if (!pe)
513 return -ENOMEM;
514
515retry:
516 spin_lock(&ubi->wl_lock);
517 if (tree_empty(&ubi->free)) {
518 if (ubi->works_count == 0) {
519 ubi_assert(list_empty(&ubi->works));
520 ubi_err("no free eraseblocks");
521 spin_unlock(&ubi->wl_lock);
522 kfree(pe);
523 return -ENOSPC;
524 }
525 spin_unlock(&ubi->wl_lock);
526
527 err = produce_free_peb(ubi);
528 if (err < 0) {
529 kfree(pe);
530 return err;
531 }
532 goto retry;
533 }
534
535 switch (dtype) {
536 case UBI_LONGTERM:
537 /*
538 * For long term data we pick a physical eraseblock
539 * with high erase counter. But the highest erase
540		 * counter we can pick is bounded by the lowest
541 * erase counter plus %WL_FREE_MAX_DIFF.
542 */
543 e = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
544 protect = LT_PROTECTION;
545 break;
546 case UBI_UNKNOWN:
547 /*
548 * For unknown data we pick a physical eraseblock with
549		 * medium erase counter. But we by no means can pick a
550		 * physical eraseblock with an erase counter greater
551		 * than or equal to the lowest erase counter plus
552 * %WL_FREE_MAX_DIFF.
553 */
554 first = rb_entry(rb_first(&ubi->free),
555 struct ubi_wl_entry, rb);
556 last = rb_entry(rb_last(&ubi->free),
557 struct ubi_wl_entry, rb);
558
559 if (last->ec - first->ec < WL_FREE_MAX_DIFF)
560 e = rb_entry(ubi->free.rb_node,
561 struct ubi_wl_entry, rb);
562 else {
563 medium_ec = (first->ec + WL_FREE_MAX_DIFF)/2;
564 e = find_wl_entry(&ubi->free, medium_ec);
565 }
566 protect = U_PROTECTION;
567 break;
568 case UBI_SHORTTERM:
569 /*
570 * For short term data we pick a physical eraseblock
571 * with the lowest erase counter as we expect it will
572 * be erased soon.
573 */
574 e = rb_entry(rb_first(&ubi->free),
575 struct ubi_wl_entry, rb);
576 protect = ST_PROTECTION;
577 break;
578 default:
579 protect = 0;
580 e = NULL;
581 BUG();
582 }
583
584 /*
585 * Move the physical eraseblock to the protection trees where it will
586 * be protected from being moved for some time.
587 */
588 free_tree_del(ubi, e);
589 prot_tree_add(ubi, e, pe, protect);
590
591 dbg_wl("PEB %d EC %d, protection %d", e->pnum, e->ec, protect);
592 spin_unlock(&ubi->wl_lock);
593
594 return e->pnum;
595}
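
For context, a hedged sketch of how a caller such as the EBA unit would use
this function together with ubi_wl_put_peb(); error handling is reduced to
the minimum:

	int pnum, err;

	pnum = ubi_wl_get_peb(ubi, UBI_SHORTTERM);
	if (pnum < 0)
		return pnum;	/* e.g. %-ENOSPC or %-ENOMEM */

	/* ... write a VID header and data to PEB pnum via the I/O unit ... */

	err = ubi_wl_put_peb(ubi, pnum, 0);	/* schedule it for erasure */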
596
597/**
598 * prot_tree_del - remove a physical eraseblock from the protection trees
599 * @ubi: UBI device description object
600 * @pnum: the physical eraseblock to remove
601 */
602static void prot_tree_del(struct ubi_device *ubi, int pnum)
603{
604 struct rb_node *p;
605 struct ubi_wl_prot_entry *pe = NULL;
606
607 p = ubi->prot.pnum.rb_node;
608 while (p) {
609
610 pe = rb_entry(p, struct ubi_wl_prot_entry, rb_pnum);
611
612 if (pnum == pe->e->pnum)
613 break;
614
615 if (pnum < pe->e->pnum)
616 p = p->rb_left;
617 else
618 p = p->rb_right;
619 }
620
621 ubi_assert(pe->e->pnum == pnum);
622 rb_erase(&pe->rb_aec, &ubi->prot.aec);
623 rb_erase(&pe->rb_pnum, &ubi->prot.pnum);
624 kfree(pe);
625}
626
627/**
628 * sync_erase - synchronously erase a physical eraseblock.
629 * @ubi: UBI device description object
630 * @e: the physical eraseblock to erase
631 * @torture: if the physical eraseblock has to be tortured
632 *
633 * This function returns zero in case of success and a negative error code in
634 * case of failure.
635 */
636static int sync_erase(struct ubi_device *ubi, struct ubi_wl_entry *e, int torture)
637{
638 int err;
639 struct ubi_ec_hdr *ec_hdr;
640 unsigned long long ec = e->ec;
641
642 dbg_wl("erase PEB %d, old EC %llu", e->pnum, ec);
643
644 err = paranoid_check_ec(ubi, e->pnum, e->ec);
645 if (err > 0)
646 return -EINVAL;
647
648 ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL);
649 if (!ec_hdr)
650 return -ENOMEM;
651
652 err = ubi_io_sync_erase(ubi, e->pnum, torture);
653 if (err < 0)
654 goto out_free;
655
656 ec += err;
657 if (ec > UBI_MAX_ERASECOUNTER) {
658 /*
659 * Erase counter overflow. Upgrade UBI and use 64-bit
660 * erase counters internally.
661 */
662 ubi_err("erase counter overflow at PEB %d, EC %llu",
663 e->pnum, ec);
664 err = -EINVAL;
665 goto out_free;
666 }
667
668 dbg_wl("erased PEB %d, new EC %llu", e->pnum, ec);
669
670 ec_hdr->ec = cpu_to_ubi64(ec);
671
672 err = ubi_io_write_ec_hdr(ubi, e->pnum, ec_hdr);
673 if (err)
674 goto out_free;
675
676 e->ec = ec;
677 spin_lock(&ubi->wl_lock);
678 if (e->ec > ubi->max_ec)
679 ubi->max_ec = e->ec;
680 spin_unlock(&ubi->wl_lock);
681
682out_free:
683 kfree(ec_hdr);
684 return err;
685}
686
687/**
688 * check_protection_over - check if it is time to stop protecting some
689 * physical eraseblocks.
690 * @ubi: UBI device description object
691 *
692 * This function is called after each erase operation, when the absolute erase
693 * counter is incremented, to check if some physical eraseblocks do not have
694 * to be protected any longer. These physical eraseblocks are moved from the
695 * protection trees to the used tree.
696 */
697static void check_protection_over(struct ubi_device *ubi)
698{
699 struct ubi_wl_prot_entry *pe;
700
701 /*
702	 * There may be several protected physical eraseblocks to remove,
703 * process them all.
704 */
705 while (1) {
706 spin_lock(&ubi->wl_lock);
707 if (tree_empty(&ubi->prot.aec)) {
708 spin_unlock(&ubi->wl_lock);
709 break;
710 }
711
712 pe = rb_entry(rb_first(&ubi->prot.aec),
713 struct ubi_wl_prot_entry, rb_aec);
714
715 if (pe->abs_ec > ubi->abs_ec) {
716 spin_unlock(&ubi->wl_lock);
717 break;
718 }
719
720 dbg_wl("PEB %d protection over, abs_ec %llu, PEB abs_ec %llu",
721 pe->e->pnum, ubi->abs_ec, pe->abs_ec);
722 rb_erase(&pe->rb_aec, &ubi->prot.aec);
723 rb_erase(&pe->rb_pnum, &ubi->prot.pnum);
724 used_tree_add(ubi, pe->e);
725 spin_unlock(&ubi->wl_lock);
726
727 kfree(pe);
728 cond_resched();
729 }
730}
731
732/**
733 * schedule_ubi_work - schedule a work.
734 * @ubi: UBI device description object
735 * @wrk: the work to schedule
736 *
737 * This function enqueues a work defined by @wrk to the tail of the pending
738 * works list.
739 */
740static void schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk)
741{
742 spin_lock(&ubi->wl_lock);
743 list_add_tail(&wrk->list, &ubi->works);
744 ubi_assert(ubi->works_count >= 0);
745 ubi->works_count += 1;
746 if (ubi->thread_enabled)
747 wake_up_process(ubi->bgt_thread);
748 spin_unlock(&ubi->wl_lock);
749}
750
751static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk,
752 int cancel);
753
754/**
755 * schedule_erase - schedule an erase work.
756 * @ubi: UBI device description object
757 * @e: the WL entry of the physical eraseblock to erase
758 * @torture: if the physical eraseblock has to be tortured
759 *
760 * This function returns zero in case of success and a %-ENOMEM in case of
761 * failure.
762 */
763static int schedule_erase(struct ubi_device *ubi, struct ubi_wl_entry *e,
764 int torture)
765{
766 struct ubi_work *wl_wrk;
767
768 dbg_wl("schedule erasure of PEB %d, EC %d, torture %d",
769 e->pnum, e->ec, torture);
770
771 wl_wrk = kmalloc(sizeof(struct ubi_work), GFP_KERNEL);
772 if (!wl_wrk)
773 return -ENOMEM;
774
775 wl_wrk->func = &erase_worker;
776 wl_wrk->e = e;
777 wl_wrk->torture = torture;
778
779 schedule_ubi_work(ubi, wl_wrk);
780 return 0;
781}
782
783/**
784 * wear_leveling_worker - wear-leveling worker function.
785 * @ubi: UBI device description object
786 * @wrk: the work object
787 * @cancel: non-zero if the worker has to free memory and exit
788 *
789 * This function copies a more worn out physical eraseblock to a less worn out
790 * one. Returns zero in case of success and a negative error code in case of
791 * failure.
792 */
793static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
794 int cancel)
795{
796 int err, put = 0;
797 struct ubi_wl_entry *e1, *e2;
798 struct ubi_vid_hdr *vid_hdr;
799
800 kfree(wrk);
801
802 if (cancel)
803 return 0;
804
805 vid_hdr = ubi_zalloc_vid_hdr(ubi);
806 if (!vid_hdr)
807 return -ENOMEM;
808
809 spin_lock(&ubi->wl_lock);
810
811 /*
812	 * Only one WL worker at a time is supported in this implementation, so
813 * make sure a PEB is not being moved already.
814 */
815 if (ubi->move_to || tree_empty(&ubi->free) ||
816 (tree_empty(&ubi->used) && tree_empty(&ubi->scrub))) {
817 /*
818		 * Only one WL worker at a time is supported in this
819		 * implementation, so if a LEB is already being moved, cancel.
820 *
821 * No free physical eraseblocks? Well, we cancel wear-leveling
822 * then. It will be triggered again when a free physical
823 * eraseblock appears.
824 *
825 * No used physical eraseblocks? They must be temporarily
826 * protected from being moved. They will be moved to the
827 * @ubi->used tree later and the wear-leveling will be
828 * triggered again.
829 */
830 dbg_wl("cancel WL, a list is empty: free %d, used %d",
831 tree_empty(&ubi->free), tree_empty(&ubi->used));
832 ubi->wl_scheduled = 0;
833 spin_unlock(&ubi->wl_lock);
834 ubi_free_vid_hdr(ubi, vid_hdr);
835 return 0;
836 }
837
838 if (tree_empty(&ubi->scrub)) {
839 /*
840 * Now pick the least worn-out used physical eraseblock and a
841 * highly worn-out free physical eraseblock. If the erase
842		 * counters differ enough, start wear-leveling.
843 */
844 e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, rb);
845 e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
846
847 if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) {
848 dbg_wl("no WL needed: min used EC %d, max free EC %d",
849 e1->ec, e2->ec);
850 ubi->wl_scheduled = 0;
851 spin_unlock(&ubi->wl_lock);
852 ubi_free_vid_hdr(ubi, vid_hdr);
853 return 0;
854 }
855 used_tree_del(ubi, e1);
856 dbg_wl("move PEB %d EC %d to PEB %d EC %d",
857 e1->pnum, e1->ec, e2->pnum, e2->ec);
858 } else {
859 e1 = rb_entry(rb_first(&ubi->scrub), struct ubi_wl_entry, rb);
860 e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
861 scrub_tree_del(ubi, e1);
862 dbg_wl("scrub PEB %d to PEB %d", e1->pnum, e2->pnum);
863 }
864
865 free_tree_del(ubi, e2);
866 ubi_assert(!ubi->move_from && !ubi->move_to);
867 ubi_assert(!ubi->move_to_put && !ubi->move_from_put);
868 ubi->move_from = e1;
869 ubi->move_to = e2;
870 spin_unlock(&ubi->wl_lock);
871
872 /*
873 * Now we are going to copy physical eraseblock @e1->pnum to @e2->pnum.
874 * We so far do not know which logical eraseblock our physical
875 * eraseblock (@e1) belongs to. We have to read the volume identifier
876 * header first.
877 */
878
879 err = ubi_io_read_vid_hdr(ubi, e1->pnum, vid_hdr, 0);
880 if (err && err != UBI_IO_BITFLIPS) {
881 if (err == UBI_IO_PEB_FREE) {
882 /*
883			 * We are trying to move a PEB without a VID header. UBI
884			 * always writes VID headers shortly after the PEB was
885			 * given out, so the holder apparently had no chance to
886			 * write it yet because it was preempted.
887 * Just re-schedule the work, so that next time it will
888 * likely have the VID header in place.
889 */
890 dbg_wl("PEB %d has no VID header", e1->pnum);
891 err = 0;
892 } else {
893 ubi_err("error %d while reading VID header from PEB %d",
894 err, e1->pnum);
895 if (err > 0)
896 err = -EIO;
897 }
898 goto error;
899 }
900
901 err = ubi_eba_copy_leb(ubi, e1->pnum, e2->pnum, vid_hdr);
902 if (err) {
903 if (err == UBI_IO_BITFLIPS)
904 err = 0;
905 goto error;
906 }
907
908 ubi_free_vid_hdr(ubi, vid_hdr);
909 spin_lock(&ubi->wl_lock);
910 if (!ubi->move_to_put)
911 used_tree_add(ubi, e2);
912 else
913 put = 1;
914 ubi->move_from = ubi->move_to = NULL;
915 ubi->move_from_put = ubi->move_to_put = 0;
916 ubi->wl_scheduled = 0;
917 spin_unlock(&ubi->wl_lock);
918
919 if (put) {
920 /*
921 * Well, the target PEB was put meanwhile, schedule it for
922 * erasure.
923 */
924 dbg_wl("PEB %d was put meanwhile, erase", e2->pnum);
925 err = schedule_erase(ubi, e2, 0);
926 if (err) {
927 kmem_cache_free(wl_entries_slab, e2);
928 ubi_ro_mode(ubi);
929 }
930 }
931
932 err = schedule_erase(ubi, e1, 0);
933 if (err) {
934 kmem_cache_free(wl_entries_slab, e1);
935 ubi_ro_mode(ubi);
936 }
937
938 dbg_wl("done");
939 return err;
940
941 /*
942	 * Some error occurred. @e1 was not changed, so put it back. @e2
943	 * might have been changed, so schedule it for erasure.
944 */
945error:
946 if (err)
947 dbg_wl("error %d occurred, cancel operation", err);
948 ubi_assert(err <= 0);
949
950 ubi_free_vid_hdr(ubi, vid_hdr);
951 spin_lock(&ubi->wl_lock);
952 ubi->wl_scheduled = 0;
953 if (ubi->move_from_put)
954 put = 1;
955 else
956 used_tree_add(ubi, e1);
957 ubi->move_from = ubi->move_to = NULL;
958 ubi->move_from_put = ubi->move_to_put = 0;
959 spin_unlock(&ubi->wl_lock);
960
961 if (put) {
962 /*
963 * Well, the target PEB was put meanwhile, schedule it for
964 * erasure.
965 */
966 dbg_wl("PEB %d was put meanwhile, erase", e1->pnum);
967 err = schedule_erase(ubi, e1, 0);
968 if (err) {
969 kmem_cache_free(wl_entries_slab, e1);
970 ubi_ro_mode(ubi);
971 }
972 }
973
974 err = schedule_erase(ubi, e2, 0);
975 if (err) {
976 kmem_cache_free(wl_entries_slab, e2);
977 ubi_ro_mode(ubi);
978 }
979
980 yield();
981 return err;
982}
983
984/**
985 * ensure_wear_leveling - schedule wear-leveling if it is needed.
986 * @ubi: UBI device description object
987 *
988 * This function checks if it is time to start wear-leveling and schedules it
989 * if yes. This function returns zero in case of success and a negative error
990 * code in case of failure.
991 */
992static int ensure_wear_leveling(struct ubi_device *ubi)
993{
994 int err = 0;
995 struct ubi_wl_entry *e1;
996 struct ubi_wl_entry *e2;
997 struct ubi_work *wrk;
998
999 spin_lock(&ubi->wl_lock);
1000 if (ubi->wl_scheduled)
1001 /* Wear-leveling is already in the work queue */
1002 goto out_unlock;
1003
1004 /*
1005 * If the ubi->scrub tree is not empty, scrubbing is needed, and the
1006	 * WL worker has to be scheduled anyway.
1007 */
1008 if (tree_empty(&ubi->scrub)) {
1009 if (tree_empty(&ubi->used) || tree_empty(&ubi->free))
1010 /* No physical eraseblocks - no deal */
1011 goto out_unlock;
1012
1013 /*
1014 * We schedule wear-leveling only if the difference between the
1015 * lowest erase counter of used physical eraseblocks and a high
1016		 * erase counter of free physical eraseblocks is greater than
1017 * %UBI_WL_THRESHOLD.
1018 */
1019 e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, rb);
1020 e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
1021
1022 if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD))
1023 goto out_unlock;
1024 dbg_wl("schedule wear-leveling");
1025 } else
1026 dbg_wl("schedule scrubbing");
1027
1028 ubi->wl_scheduled = 1;
1029 spin_unlock(&ubi->wl_lock);
1030
1031 wrk = kmalloc(sizeof(struct ubi_work), GFP_KERNEL);
1032 if (!wrk) {
1033 err = -ENOMEM;
1034 goto out_cancel;
1035 }
1036
1037 wrk->func = &wear_leveling_worker;
1038 schedule_ubi_work(ubi, wrk);
1039 return err;
1040
1041out_cancel:
1042 spin_lock(&ubi->wl_lock);
1043 ubi->wl_scheduled = 0;
1044out_unlock:
1045 spin_unlock(&ubi->wl_lock);
1046 return err;
1047}
1048
1049/**
1050 * erase_worker - physical eraseblock erase worker function.
1051 * @ubi: UBI device description object
1052 * @wl_wrk: the work object
1053 * @cancel: non-zero if the worker has to free memory and exit
1054 *
1055 * This function erases a physical eraseblock and performs torture testing if
1056 * needed. It also takes care of marking the physical eraseblock bad if
1057 * needed. Returns zero in case of success and a negative error code in case of
1058 * failure.
1059 */
1060static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk,
1061 int cancel)
1062{
1063 int err;
1064 struct ubi_wl_entry *e = wl_wrk->e;
1065 int pnum = e->pnum;
1066
1067 if (cancel) {
1068 dbg_wl("cancel erasure of PEB %d EC %d", pnum, e->ec);
1069 kfree(wl_wrk);
1070 kmem_cache_free(wl_entries_slab, e);
1071 return 0;
1072 }
1073
1074 dbg_wl("erase PEB %d EC %d", pnum, e->ec);
1075
1076 err = sync_erase(ubi, e, wl_wrk->torture);
1077 if (!err) {
1078 /* Fine, we've erased it successfully */
1079 kfree(wl_wrk);
1080
1081 spin_lock(&ubi->wl_lock);
1082 ubi->abs_ec += 1;
1083 free_tree_add(ubi, e);
1084 spin_unlock(&ubi->wl_lock);
1085
1086 /*
1087		 * One more erase operation has happened, take care of protected
1088 * physical eraseblocks.
1089 */
1090 check_protection_over(ubi);
1091
1092 /* And take care about wear-leveling */
1093 err = ensure_wear_leveling(ubi);
1094 return err;
1095 }
1096
1097 kfree(wl_wrk);
1098 kmem_cache_free(wl_entries_slab, e);
1099
1100 if (err != -EIO) {
1101 /*
1102 * If this is not %-EIO, we have no idea what to do. Scheduling
1103 * this physical eraseblock for erasure again would cause
1104		 * errors again and again. Well, let's switch to RO mode.
1105 */
1106 ubi_ro_mode(ubi);
1107 return err;
1108 }
1109
1110 /* It is %-EIO, the PEB went bad */
1111
1112 if (!ubi->bad_allowed) {
1113 ubi_err("bad physical eraseblock %d detected", pnum);
1114 ubi_ro_mode(ubi);
1115 err = -EIO;
1116 } else {
1117 int need;
1118
1119 spin_lock(&ubi->volumes_lock);
1120 need = ubi->beb_rsvd_level - ubi->beb_rsvd_pebs + 1;
1121 if (need > 0) {
1122 need = ubi->avail_pebs >= need ? need : ubi->avail_pebs;
1123 ubi->avail_pebs -= need;
1124 ubi->rsvd_pebs += need;
1125 ubi->beb_rsvd_pebs += need;
1126 if (need > 0)
1127				ubi_msg("reserve %d more PEBs", need);
1128 }
1129
1130 if (ubi->beb_rsvd_pebs == 0) {
1131 spin_unlock(&ubi->volumes_lock);
1132 ubi_err("no reserved physical eraseblocks");
1133 ubi_ro_mode(ubi);
1134 return -EIO;
1135 }
1136
1137 spin_unlock(&ubi->volumes_lock);
1138 ubi_msg("mark PEB %d as bad", pnum);
1139
1140 err = ubi_io_mark_bad(ubi, pnum);
1141 if (err) {
1142 ubi_ro_mode(ubi);
1143 return err;
1144 }
1145
1146 spin_lock(&ubi->volumes_lock);
1147 ubi->beb_rsvd_pebs -= 1;
1148 ubi->bad_peb_count += 1;
1149 ubi->good_peb_count -= 1;
1150 ubi_calculate_reserved(ubi);
1151 if (ubi->beb_rsvd_pebs == 0)
1152 ubi_warn("last PEB from the reserved pool was used");
1153 spin_unlock(&ubi->volumes_lock);
1154 }
1155
1156 return err;
1157}
1158
1159/**
1160 * ubi_wl_put_peb - return a physical eraseblock to the wear-leveling
1161 * unit.
1162 * @ubi: UBI device description object
1163 * @pnum: physical eraseblock to return
1164 * @torture: if this physical eraseblock has to be tortured
1165 *
1166 * This function is called to return physical eraseblock @pnum to the pool of
1167 * free physical eraseblocks. The @torture flag has to be set if an I/O error
1168 * occurred to this @pnum and it has to be tested. This function returns zero
1169 * in case of success and a negative error code in case of failure.
1170 */
1171int ubi_wl_put_peb(struct ubi_device *ubi, int pnum, int torture)
1172{
1173 int err;
1174 struct ubi_wl_entry *e;
1175
1176 dbg_wl("PEB %d", pnum);
1177 ubi_assert(pnum >= 0);
1178 ubi_assert(pnum < ubi->peb_count);
1179
1180 spin_lock(&ubi->wl_lock);
1181
1182 e = ubi->lookuptbl[pnum];
1183 if (e == ubi->move_from) {
1184 /*
1185 * User is putting the physical eraseblock which was selected to
1186 * be moved. It will be scheduled for erasure in the
1187 * wear-leveling worker.
1188 */
1189 dbg_wl("PEB %d is being moved", pnum);
1190 ubi_assert(!ubi->move_from_put);
1191 ubi->move_from_put = 1;
1192 spin_unlock(&ubi->wl_lock);
1193 return 0;
1194 } else if (e == ubi->move_to) {
1195 /*
1196 * User is putting the physical eraseblock which was selected
1197		 * as the target the data is moved to. It may happen if the EBA
1198		 * unit has already re-mapped the LEB but the WL unit has not
1199		 * yet put the PEB to the "used" tree.
1200 */
1201 dbg_wl("PEB %d is the target of data moving", pnum);
1202 ubi_assert(!ubi->move_to_put);
1203 ubi->move_to_put = 1;
1204 spin_unlock(&ubi->wl_lock);
1205 return 0;
1206 } else {
1207 if (in_wl_tree(e, &ubi->used))
1208 used_tree_del(ubi, e);
1209 else if (in_wl_tree(e, &ubi->scrub))
1210 scrub_tree_del(ubi, e);
1211 else
1212 prot_tree_del(ubi, e->pnum);
1213 }
1214 spin_unlock(&ubi->wl_lock);
1215
1216 err = schedule_erase(ubi, e, torture);
1217 if (err) {
1218 spin_lock(&ubi->wl_lock);
1219 used_tree_add(ubi, e);
1220 spin_unlock(&ubi->wl_lock);
1221 }
1222
1223 return err;
1224}
1225
1226/**
1227 * ubi_wl_scrub_peb - schedule a physical eraseblock for scrubbing.
1228 * @ubi: UBI device description object
1229 * @pnum: the physical eraseblock to schedule
1230 *
1231 * If a bit-flip in a physical eraseblock is detected, this physical eraseblock
1232 * needs scrubbing. This function schedules a physical eraseblock for
1233 * scrubbing, which is done in the background. It returns zero in case of
1234 * success and a negative error code in case of failure.
1235 */
1236int ubi_wl_scrub_peb(struct ubi_device *ubi, int pnum)
1237{
1238 struct ubi_wl_entry *e;
1239
1240 ubi_msg("schedule PEB %d for scrubbing", pnum);
1241
1242retry:
1243 spin_lock(&ubi->wl_lock);
1244 e = ubi->lookuptbl[pnum];
1245 if (e == ubi->move_from || in_wl_tree(e, &ubi->scrub)) {
1246 spin_unlock(&ubi->wl_lock);
1247 return 0;
1248 }
1249
1250 if (e == ubi->move_to) {
1251 /*
1252 * This physical eraseblock was used to move data to. The data
1253 * was moved but the PEB was not yet inserted to the proper
1254 * tree. We should just wait a little and let the WL worker
1255 * proceed.
1256 */
1257 spin_unlock(&ubi->wl_lock);
1258 dbg_wl("the PEB %d is not in proper tree, retry", pnum);
1259 yield();
1260 goto retry;
1261 }
1262
1263 if (in_wl_tree(e, &ubi->used))
1264 used_tree_del(ubi, e);
1265 else
1266 prot_tree_del(ubi, pnum);
1267
1268 scrub_tree_add(ubi, e);
1269 spin_unlock(&ubi->wl_lock);
1270
1271 /*
1272 * Technically scrubbing is the same as wear-leveling, so it is done
1273 * by the WL worker.
1274 */
1275 return ensure_wear_leveling(ubi);
1276}
1277
1278/**
1279 * ubi_wl_flush - flush all pending works.
1280 * @ubi: UBI device description object
1281 *
1282 * This function returns zero in case of success and a negative error code in
1283 * case of failure.
1284 */
1285int ubi_wl_flush(struct ubi_device *ubi)
1286{
1287 int err, pending_count;
1288
1289 pending_count = ubi->works_count;
1290
1291 dbg_wl("flush (%d pending works)", pending_count);
1292
1293 /*
1294	 * Erase while the pending works queue is not empty, but not more than
1295 * the number of currently pending works.
1296 */
1297 while (pending_count-- > 0) {
1298 err = do_work(ubi);
1299 if (err)
1300 return err;
1301 }
1302
1303 return 0;
1304}
1305
1306/**
1307 * tree_destroy - destroy an RB-tree.
1308 * @root: the root of the tree to destroy
1309 */
1310static void tree_destroy(struct rb_root *root)
1311{
1312 struct rb_node *rb;
1313 struct ubi_wl_entry *e;
1314
1315 rb = root->rb_node;
1316 while (rb) {
1317 if (rb->rb_left)
1318 rb = rb->rb_left;
1319 else if (rb->rb_right)
1320 rb = rb->rb_right;
1321 else {
1322 e = rb_entry(rb, struct ubi_wl_entry, rb);
1323
1324 rb = rb_parent(rb);
1325 if (rb) {
1326 if (rb->rb_left == &e->rb)
1327 rb->rb_left = NULL;
1328 else
1329 rb->rb_right = NULL;
1330 }
1331
1332 kmem_cache_free(wl_entries_slab, e);
1333 }
1334 }
1335}
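
tree_destroy() avoids recursion (kernel stacks are small) by descending to a leaf, freeing it, and clearing the parent's pointer to it so the walk never revisits a freed node. The same idea on a plain binary tree, as a self-contained sketch; the node layout and function names are hypothetical, not the kernel's rb-tree API.

#include <stdlib.h>

struct node {
	struct node *left, *right, *parent;
};

static void destroy(struct node *n)
{
	while (n) {
		if (n->left)
			n = n->left;
		else if (n->right)
			n = n->right;
		else {
			struct node *p = n->parent;

			if (p) {	/* unlink so we never come back here */
				if (p->left == n)
					p->left = NULL;
				else
					p->right = NULL;
			}
			free(n);
			n = p;		/* continue from the parent */
		}
	}
}

int main(void)
{
	struct node *r = calloc(1, sizeof(*r));
	struct node *l = calloc(1, sizeof(*l));

	r->left = l;
	l->parent = r;
	destroy(r);
	return 0;
}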
1336
1337/**
1338 * ubi_thread - UBI background thread.
1339 * @u: the UBI device description object pointer
1340 */
1341static int ubi_thread(void *u)
1342{
1343 int failures = 0;
1344 struct ubi_device *ubi = u;
1345
1346 ubi_msg("background thread \"%s\" started, PID %d",
1347 ubi->bgt_name, current->pid);
1348
1349 for (;;) {
1350 int err;
1351
1352 if (kthread_should_stop())
1353 goto out;
1354
1355 if (try_to_freeze())
1356 continue;
1357
1358 spin_lock(&ubi->wl_lock);
1359 if (list_empty(&ubi->works) || ubi->ro_mode ||
1360 !ubi->thread_enabled) {
1361 set_current_state(TASK_INTERRUPTIBLE);
1362 spin_unlock(&ubi->wl_lock);
1363 schedule();
1364 continue;
1365 }
1366 spin_unlock(&ubi->wl_lock);
1367
1368 err = do_work(ubi);
1369 if (err) {
1370 ubi_err("%s: work failed with error code %d",
1371 ubi->bgt_name, err);
1372 if (failures++ > WL_MAX_FAILURES) {
1373 /*
1374 * Too many failures, disable the thread and
1375 * switch to read-only mode.
1376 */
1377 ubi_msg("%s: %d consecutive failures",
1378 ubi->bgt_name, WL_MAX_FAILURES);
1379 ubi_ro_mode(ubi);
1380 break;
1381 }
1382 } else
1383 failures = 0;
1384
1385 cond_resched();
1386 }
1387
1388out:
1389 dbg_wl("background thread \"%s\" is killed", ubi->bgt_name);
1390 return 0;
1391}
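
The thread only gives up after WL_MAX_FAILURES consecutive errors, and any success resets the counter. The sketch below isolates that failure-counting policy; MAX_FAILURES and do_one() are illustrative, and the threshold here is deliberately tiny.

#include <stdio.h>

#define MAX_FAILURES 4

static int do_one(int i)		/* succeeds at first, then always fails */
{
	return i < 3 ? 0 : -1;
}

int main(void)
{
	int i, failures = 0;

	for (i = 0; i < 100; i++) {
		if (do_one(i)) {
			if (failures++ > MAX_FAILURES) {
				puts("too many consecutive failures, giving up");
				return 1;
			}
		} else
			failures = 0;	/* a success resets the streak */
	}
	return 0;
}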
1392
1393/**
1394 * cancel_pending - cancel all pending works.
1395 * @ubi: UBI device description object
1396 */
1397static void cancel_pending(struct ubi_device *ubi)
1398{
1399 while (!list_empty(&ubi->works)) {
1400 struct ubi_work *wrk;
1401
1402 wrk = list_entry(ubi->works.next, struct ubi_work, list);
1403 list_del(&wrk->list);
1404 wrk->func(ubi, wrk, 1);
1405 ubi->works_count -= 1;
1406 ubi_assert(ubi->works_count >= 0);
1407 }
1408}
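
Each queued work is invoked one final time with the cancel flag set, so the callback can release its resources instead of performing the work. A stripped-down sketch of that callback contract; the work structure and list handling here are simplified stand-ins.

#include <stdio.h>
#include <stdlib.h>

struct work {
	struct work *next;
	void (*func)(struct work *w, int cancel);
};

static void erase_work(struct work *w, int cancel)
{
	if (cancel)
		puts("cancelled: freeing resources only");
	else
		puts("running the erase");
	free(w);
}

static void cancel_all(struct work **head)
{
	while (*head) {
		struct work *w = *head;

		*head = w->next;
		w->func(w, 1);		/* 1 == cancel, do not do the work */
	}
}

int main(void)
{
	struct work *w = malloc(sizeof(*w));

	w->next = NULL;
	w->func = erase_work;
	cancel_all(&w);
	return 0;
}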
1409
1410/**
1411 * ubi_wl_init_scan - initialize the wear-leveling unit using scanning
1412 * information.
1413 * @ubi: UBI device description object
1414 * @si: scanning information
1415 *
1416 * This function returns zero in case of success, and a negative error code in
1417 * case of failure.
1418 */
1419int ubi_wl_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si)
1420{
1421 int err;
1422 struct rb_node *rb1, *rb2;
1423 struct ubi_scan_volume *sv;
1424 struct ubi_scan_leb *seb, *tmp;
1425 struct ubi_wl_entry *e;
1426
1427
1428 ubi->used = ubi->free = ubi->scrub = RB_ROOT;
1429 ubi->prot.pnum = ubi->prot.aec = RB_ROOT;
1430 spin_lock_init(&ubi->wl_lock);
1431 ubi->max_ec = si->max_ec;
1432 INIT_LIST_HEAD(&ubi->works);
1433
1434 sprintf(ubi->bgt_name, UBI_BGT_NAME_PATTERN, ubi->ubi_num);
1435
1436 ubi->bgt_thread = kthread_create(ubi_thread, ubi, ubi->bgt_name);
1437 if (IS_ERR(ubi->bgt_thread)) {
1438 err = PTR_ERR(ubi->bgt_thread);
1439 ubi_err("cannot spawn \"%s\", error %d", ubi->bgt_name,
1440 err);
1441 return err;
1442 }
1443
1444 if (ubi_devices_cnt == 0) {
1445 wl_entries_slab = kmem_cache_create("ubi_wl_entry_slab",
1446 sizeof(struct ubi_wl_entry),
1447 0, 0, NULL, NULL);
1448 if (!wl_entries_slab)
1449 return -ENOMEM;
1450 }
1451
1452 err = -ENOMEM;
1453 ubi->lookuptbl = kzalloc(ubi->peb_count * sizeof(void *), GFP_KERNEL);
1454 if (!ubi->lookuptbl)
1455 goto out_free;
1456
1457 list_for_each_entry_safe(seb, tmp, &si->erase, u.list) {
1458 cond_resched();
1459
1460 e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL);
1461 if (!e)
1462 goto out_free;
1463
1464 e->pnum = seb->pnum;
1465 e->ec = seb->ec;
1466 ubi->lookuptbl[e->pnum] = e;
1467 if (schedule_erase(ubi, e, 0)) {
1468 kmem_cache_free(wl_entries_slab, e);
1469 goto out_free;
1470 }
1471 }
1472
1473 list_for_each_entry(seb, &si->free, u.list) {
1474 cond_resched();
1475
1476 e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL);
1477 if (!e)
1478 goto out_free;
1479
1480 e->pnum = seb->pnum;
1481 e->ec = seb->ec;
1482 ubi_assert(e->ec >= 0);
1483 free_tree_add(ubi, e);
1484 ubi->lookuptbl[e->pnum] = e;
1485 }
1486
1487 list_for_each_entry(seb, &si->corr, u.list) {
1488 cond_resched();
1489
1490 e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL);
1491 if (!e)
1492 goto out_free;
1493
1494 e->pnum = seb->pnum;
1495 e->ec = seb->ec;
1496 ubi->lookuptbl[e->pnum] = e;
1497 if (schedule_erase(ubi, e, 0)) {
1498 kmem_cache_free(wl_entries_slab, e);
1499 goto out_free;
1500 }
1501 }
1502
1503 ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) {
1504 ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb) {
1505 cond_resched();
1506
1507 e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL);
1508 if (!e)
1509 goto out_free;
1510
1511 e->pnum = seb->pnum;
1512 e->ec = seb->ec;
1513 ubi->lookuptbl[e->pnum] = e;
1514 if (!seb->scrub) {
1515 dbg_wl("add PEB %d EC %d to the used tree",
1516 e->pnum, e->ec);
1517 used_tree_add(ubi, e);
1518 } else {
1519 dbg_wl("add PEB %d EC %d to the scrub tree",
1520 e->pnum, e->ec);
1521 scrub_tree_add(ubi, e);
1522 }
1523 }
1524 }
1525
1526	if (WL_RESERVED_PEBS > ubi->avail_pebs) {
1527		ubi_err("not enough physical eraseblocks (%d, need %d)",
1528			ubi->avail_pebs, WL_RESERVED_PEBS);
1529		err = -ENOSPC;
1530		goto out_free;
1531	}
1531 ubi->avail_pebs -= WL_RESERVED_PEBS;
1532 ubi->rsvd_pebs += WL_RESERVED_PEBS;
1533
1534 /* Schedule wear-leveling if needed */
1535 err = ensure_wear_leveling(ubi);
1536 if (err)
1537 goto out_free;
1538
1539 return 0;
1540
1541out_free:
1542 cancel_pending(ubi);
1543 tree_destroy(&ubi->used);
1544 tree_destroy(&ubi->free);
1545 tree_destroy(&ubi->scrub);
1546 kfree(ubi->lookuptbl);
1547 if (ubi_devices_cnt == 0)
1548 kmem_cache_destroy(wl_entries_slab);
1549 return err;
1550}
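
Since PEB numbers are small, dense integers, lookuptbl is simply an array of entry pointers indexed by pnum, giving O(1) lookup alongside the O(log n) RB-trees. A minimal sketch of that direct-index bookkeeping; the types here are illustrative.

#include <stdio.h>
#include <stdlib.h>

struct entry {
	int pnum;
	int ec;				/* erase counter */
};

int main(void)
{
	const int peb_count = 8;
	struct entry **lookup = calloc(peb_count, sizeof(*lookup));
	struct entry *e = malloc(sizeof(*e));

	e->pnum = 5;
	e->ec = 42;
	lookup[e->pnum] = e;		/* O(1) access by PEB number */

	printf("PEB 5 has EC %d\n", lookup[5]->ec);
	free(e);
	free(lookup);
	return 0;
}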
1551
1552/**
1553 * protection_trees_destroy - destroy the protection RB-trees.
1554 * @ubi: UBI device description object
1555 */
1556static void protection_trees_destroy(struct ubi_device *ubi)
1557{
1558 struct rb_node *rb;
1559 struct ubi_wl_prot_entry *pe;
1560
1561 rb = ubi->prot.aec.rb_node;
1562 while (rb) {
1563 if (rb->rb_left)
1564 rb = rb->rb_left;
1565 else if (rb->rb_right)
1566 rb = rb->rb_right;
1567 else {
1568 pe = rb_entry(rb, struct ubi_wl_prot_entry, rb_aec);
1569
1570 rb = rb_parent(rb);
1571 if (rb) {
1572 if (rb->rb_left == &pe->rb_aec)
1573 rb->rb_left = NULL;
1574 else
1575 rb->rb_right = NULL;
1576 }
1577
1578 kmem_cache_free(wl_entries_slab, pe->e);
1579 kfree(pe);
1580 }
1581 }
1582}
1583
1584/**
1585 * ubi_wl_close - close the wear-leveling unit.
1586 * @ubi: UBI device description object
1587 */
1588void ubi_wl_close(struct ubi_device *ubi)
1589{
1590 dbg_wl("disable \"%s\"", ubi->bgt_name);
1591 if (ubi->bgt_thread)
1592 kthread_stop(ubi->bgt_thread);
1593
1594 dbg_wl("close the UBI wear-leveling unit");
1595
1596 cancel_pending(ubi);
1597 protection_trees_destroy(ubi);
1598 tree_destroy(&ubi->used);
1599 tree_destroy(&ubi->free);
1600 tree_destroy(&ubi->scrub);
1601 kfree(ubi->lookuptbl);
1602 if (ubi_devices_cnt == 1)
1603 kmem_cache_destroy(wl_entries_slab);
1604}
1605
1606#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
1607
1608/**
1609 * paranoid_check_ec - make sure that the erase counter of a physical eraseblock
1610 * is correct.
1611 * @ubi: UBI device description object
1612 * @pnum: the physical eraseblock number to check
1613 * @ec: the erase counter to check
1614 *
1615 * This function returns zero if the erase counter of physical eraseblock @pnum
1616 * is equal to @ec, %1 if not, and a negative error code if an error
1617 * occurred.
1618 */
1619static int paranoid_check_ec(const struct ubi_device *ubi, int pnum, int ec)
1620{
1621 int err;
1622 long long read_ec;
1623 struct ubi_ec_hdr *ec_hdr;
1624
1625 ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL);
1626 if (!ec_hdr)
1627 return -ENOMEM;
1628
1629 err = ubi_io_read_ec_hdr(ubi, pnum, ec_hdr, 0);
1630 if (err && err != UBI_IO_BITFLIPS) {
1631 /* The header does not have to exist */
1632 err = 0;
1633 goto out_free;
1634 }
1635
1636 read_ec = ubi64_to_cpu(ec_hdr->ec);
1637 if (ec != read_ec) {
1638 ubi_err("paranoid check failed for PEB %d", pnum);
1639 ubi_err("read EC is %lld, should be %d", read_ec, ec);
1640 ubi_dbg_dump_stack();
1641 err = 1;
1642 } else
1643 err = 0;
1644
1645out_free:
1646 kfree(ec_hdr);
1647 return err;
1648}
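
Paranoid checks follow a three-way return convention: zero when the on-flash value matches, %1 when it does not, and a negative error code when the check itself could not be performed. A sketch of that convention in plain C; read_stored_ec() is a made-up stand-in for reading the EC header.

#include <stdio.h>

static int read_stored_ec(int pnum, long long *ec)	/* hypothetical */
{
	(void)pnum;
	*ec = 42;
	return 0;
}

static int check_ec(int pnum, long long expected)
{
	long long read_ec;
	int err = read_stored_ec(pnum, &read_ec);

	if (err)
		return err;		/* negative: could not check */
	if (read_ec != expected) {
		fprintf(stderr, "EC mismatch on PEB %d: %lld != %lld\n",
			pnum, read_ec, expected);
		return 1;		/* positive: the check failed */
	}
	return 0;			/* zero: all good */
}

int main(void)
{
	return check_ec(3, 42);
}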
1649
1650/**
1651 * paranoid_check_in_wl_tree - make sure that a wear-leveling entry is present
1652 * in a WL RB-tree.
1653 * @e: the wear-leveling entry to check
1654 * @root: the root of the tree
1655 *
1656 * This function returns zero if @e is in the @root RB-tree and %1 if it
1657 * is not.
1658 */
1659static int paranoid_check_in_wl_tree(struct ubi_wl_entry *e,
1660 struct rb_root *root)
1661{
1662 if (in_wl_tree(e, root))
1663 return 0;
1664
1665	ubi_err("paranoid check failed for PEB %d, EC %d, RB-tree %p",
1666 e->pnum, e->ec, root);
1667 ubi_dbg_dump_stack();
1668 return 1;
1669}
1670
1671#endif /* CONFIG_MTD_UBI_DEBUG_PARANOID */
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index abb90c0c09cc..8a649f602767 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -672,6 +672,13 @@ static int jffs2_flash_setup(struct jffs2_sb_info *c) {
672 return ret; 672 return ret;
673 } 673 }
674 674
675 /* and an UBI volume */
676 if (jffs2_ubivol(c)) {
677 ret = jffs2_ubivol_setup(c);
678 if (ret)
679 return ret;
680 }
681
675 return ret; 682 return ret;
676} 683}
677 684
@@ -690,4 +697,9 @@ void jffs2_flash_cleanup(struct jffs2_sb_info *c) {
690 if (jffs2_nor_wbuf_flash(c)) { 697 if (jffs2_nor_wbuf_flash(c)) {
691 jffs2_nor_wbuf_flash_cleanup(c); 698 jffs2_nor_wbuf_flash_cleanup(c);
692 } 699 }
700
701 /* and an UBI volume */
702 if (jffs2_ubivol(c)) {
703 jffs2_ubivol_cleanup(c);
704 }
693} 705}
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index e07a0edcdb4f..8d92e45168ca 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -98,6 +98,9 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
98#define jffs2_nor_wbuf_flash(c) (0) 98#define jffs2_nor_wbuf_flash(c) (0)
99#define jffs2_nor_wbuf_flash_setup(c) (0) 99#define jffs2_nor_wbuf_flash_setup(c) (0)
100#define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0) 100#define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0)
101#define jffs2_ubivol(c) (0)
102#define jffs2_ubivol_setup(c) (0)
103#define jffs2_ubivol_cleanup(c) do {} while (0)
101 104
102#else /* NAND and/or ECC'd NOR support present */ 105#else /* NAND and/or ECC'd NOR support present */
103 106
@@ -133,6 +136,9 @@ void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c);
133#define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH) 136#define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH)
134int jffs2_dataflash_setup(struct jffs2_sb_info *c); 137int jffs2_dataflash_setup(struct jffs2_sb_info *c);
135void jffs2_dataflash_cleanup(struct jffs2_sb_info *c); 138void jffs2_dataflash_cleanup(struct jffs2_sb_info *c);
139#define jffs2_ubivol(c) (c->mtd->type == MTD_UBIVOLUME)
140int jffs2_ubivol_setup(struct jffs2_sb_info *c);
141void jffs2_ubivol_cleanup(struct jffs2_sb_info *c);
136 142
137#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE)) 143#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE))
138int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c); 144int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c);
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 4fac6dd53954..ab86031b3c07 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1208,3 +1208,27 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
1208void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) { 1208void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) {
1209 kfree(c->wbuf); 1209 kfree(c->wbuf);
1210} 1210}
1211
1212int jffs2_ubivol_setup(struct jffs2_sb_info *c) {
1213 c->cleanmarker_size = 0;
1214
1215 if (c->mtd->writesize == 1)
1216 /* We do not need a write-buffer */
1217 return 0;
1218
1219 init_rwsem(&c->wbuf_sem);
1220
1221 c->wbuf_pagesize = c->mtd->writesize;
1222 c->wbuf_ofs = 0xFFFFFFFF;
1223 c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
1224 if (!c->wbuf)
1225 return -ENOMEM;
1226
1227 printk(KERN_INFO "JFFS2 write-buffering enabled: buffer %d, erasesize %d\n", c->wbuf_pagesize, c->sector_size);
1228
1229 return 0;
1230}
1231
1232void jffs2_ubivol_cleanup(struct jffs2_sb_info *c) {
1233 kfree(c->wbuf);
1234}
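
The setup hook above keys everything off mtd->writesize: a UBI volume with a one-byte write unit behaves like NOR flash and needs no write-buffer, otherwise JFFS2 allocates a buffer of exactly one write unit. The decision in isolation, as a small self-contained sketch; the flash structure and alloc_wbuf() are illustrative.

#include <stdio.h>
#include <stdlib.h>

struct flash {
	int writesize;			/* smallest programmable unit, bytes */
};

static char *alloc_wbuf(const struct flash *f)
{
	if (f->writesize == 1)
		return NULL;		/* byte-writable: no buffering needed */
	return malloc(f->writesize);	/* buffer exactly one write unit */
}

int main(void)
{
	struct flash nor = { 1 }, nand = { 2048 };
	char *b = alloc_wbuf(&nand);

	printf("nor needs wbuf: %s\n", alloc_wbuf(&nor) ? "yes" : "no");
	printf("nand needs wbuf: %s\n", b ? "yes" : "no");
	free(b);
	return 0;
}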
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f27e5378caf2..a0c8667caa72 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -27,6 +27,7 @@
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/swap.h>
30 31
31#define MLOG_MASK_PREFIX ML_DISK_ALLOC 32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
32#include <cluster/masklog.h> 33#include <cluster/masklog.h>
@@ -34,6 +35,7 @@
34#include "ocfs2.h" 35#include "ocfs2.h"
35 36
36#include "alloc.h" 37#include "alloc.h"
38#include "aops.h"
37#include "dlmglue.h" 39#include "dlmglue.h"
38#include "extent_map.h" 40#include "extent_map.h"
39#include "inode.h" 41#include "inode.h"
@@ -47,63 +49,243 @@
47 49
48#include "buffer_head_io.h" 50#include "buffer_head_io.h"
49 51
50static int ocfs2_extent_contig(struct inode *inode, 52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
51 struct ocfs2_extent_rec *ext,
52 u64 blkno);
53 53
54static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, 54/*
55 handle_t *handle, 55 * Structures which describe a path through a btree, and functions to
56 struct inode *inode, 56 * manipulate them.
57 int wanted, 57 *
58 struct ocfs2_alloc_context *meta_ac, 58 * The idea here is to be as generic as possible with the tree
59 struct buffer_head *bhs[]); 59 * manipulation code.
60 */
61struct ocfs2_path_item {
62 struct buffer_head *bh;
63 struct ocfs2_extent_list *el;
64};
60 65
61static int ocfs2_add_branch(struct ocfs2_super *osb, 66#define OCFS2_MAX_PATH_DEPTH 5
62 handle_t *handle,
63 struct inode *inode,
64 struct buffer_head *fe_bh,
65 struct buffer_head *eb_bh,
66 struct buffer_head *last_eb_bh,
67 struct ocfs2_alloc_context *meta_ac);
68 67
69static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, 68struct ocfs2_path {
70 handle_t *handle, 69 int p_tree_depth;
71 struct inode *inode, 70 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
72 struct buffer_head *fe_bh, 71};
73 struct ocfs2_alloc_context *meta_ac,
74 struct buffer_head **ret_new_eb_bh);
75 72
76static int ocfs2_do_insert_extent(struct ocfs2_super *osb, 73#define path_root_bh(_path) ((_path)->p_node[0].bh)
77 handle_t *handle, 74#define path_root_el(_path) ((_path)->p_node[0].el)
78 struct inode *inode, 75#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
79 struct buffer_head *fe_bh, 76#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
80 u64 blkno, 77#define path_num_items(_path) ((_path)->p_tree_depth + 1)
81 u32 new_clusters);
82 78
83static int ocfs2_find_branch_target(struct ocfs2_super *osb, 79/*
84 struct inode *inode, 80 * Reset the actual path elements so that we can re-use the structure
85 struct buffer_head *fe_bh, 81 * to build another path. Generally, this involves freeing the buffer
86 struct buffer_head **target_bh); 82 * heads.
83 */
84static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
85{
86 int i, start = 0, depth = 0;
87 struct ocfs2_path_item *node;
87 88
88static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, 89 if (keep_root)
89 struct inode *inode, 90 start = 1;
90 struct ocfs2_dinode *fe, 91
91 unsigned int new_i_clusters, 92 for(i = start; i < path_num_items(path); i++) {
92 struct buffer_head *old_last_eb, 93 node = &path->p_node[i];
93 struct buffer_head **new_last_eb); 94
95 brelse(node->bh);
96 node->bh = NULL;
97 node->el = NULL;
98 }
99
100 /*
101 * Tree depth may change during truncate, or insert. If we're
102 * keeping the root extent list, then make sure that our path
103 * structure reflects the proper depth.
104 */
105 if (keep_root)
106 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
107
108 path->p_tree_depth = depth;
109}
110
111static void ocfs2_free_path(struct ocfs2_path *path)
112{
113 if (path) {
114 ocfs2_reinit_path(path, 0);
115 kfree(path);
116 }
117}
118
119/*
120 * Make the *dest path the same as src and re-initialize src path to
121 * have a root only.
122 */
123static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
124{
125 int i;
126
127 BUG_ON(path_root_bh(dest) != path_root_bh(src));
128
129 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
130 brelse(dest->p_node[i].bh);
131
132 dest->p_node[i].bh = src->p_node[i].bh;
133 dest->p_node[i].el = src->p_node[i].el;
134
135 src->p_node[i].bh = NULL;
136 src->p_node[i].el = NULL;
137 }
138}
139
140/*
141 * Insert an extent block at given index.
142 *
143 * This will not take an additional reference on eb_bh.
144 */
145static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
146 struct buffer_head *eb_bh)
147{
148 struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
149
150 /*
151 * Right now, no root bh is an extent block, so this helps
152 * catch code errors with dinode trees. The assertion can be
153 * safely removed if we ever need to insert extent block
154 * structures at the root.
155 */
156 BUG_ON(index == 0);
157
158 path->p_node[index].bh = eb_bh;
159 path->p_node[index].el = &eb->h_list;
160}
161
162static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
163 struct ocfs2_extent_list *root_el)
164{
165 struct ocfs2_path *path;
166
167 BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
168
169 path = kzalloc(sizeof(*path), GFP_NOFS);
170 if (path) {
171 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
172 get_bh(root_bh);
173 path_root_bh(path) = root_bh;
174 path_root_el(path) = root_el;
175 }
176
177 return path;
178}
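
An ocfs2_path is just a fixed array of (buffer, extent list) pairs from the root down to a leaf, so the leaf is always p_node[p_tree_depth]. A compact sketch of the same shape with the fields simplified; the names here are illustrative, not the ocfs2 types.

#include <stdio.h>

#define MAX_DEPTH 5

struct path_item {
	int blkno;			/* stands in for the buffer_head */
};

struct path {
	int depth;			/* 0 means the root is also the leaf */
	struct path_item node[MAX_DEPTH];
};

#define path_root(p) (&(p)->node[0])
#define path_leaf(p) (&(p)->node[(p)->depth])

int main(void)
{
	struct path p = { .depth = 2,
			  .node = { { 100 }, { 200 }, { 300 } } };

	printf("root blk %d, leaf blk %d\n",
	       path_root(&p)->blkno, path_leaf(&p)->blkno);
	return 0;
}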
179
180/*
181 * Allocate and initialize a new path based on a disk inode tree.
182 */
183static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
184{
185 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
186 struct ocfs2_extent_list *el = &di->id2.i_list;
187
188 return ocfs2_new_path(di_bh, el);
189}
190
191/*
192 * Convenience function to journal all components in a path.
193 */
194static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
195 struct ocfs2_path *path)
196{
197 int i, ret = 0;
198
199 if (!path)
200 goto out;
201
202 for(i = 0; i < path_num_items(path); i++) {
203 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
204 OCFS2_JOURNAL_ACCESS_WRITE);
205 if (ret < 0) {
206 mlog_errno(ret);
207 goto out;
208 }
209 }
210
211out:
212 return ret;
213}
214
215enum ocfs2_contig_type {
216 CONTIG_NONE = 0,
217 CONTIG_LEFT,
218 CONTIG_RIGHT
219};
94 220
95static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
96 221
97static int ocfs2_extent_contig(struct inode *inode, 222/*
98 struct ocfs2_extent_rec *ext, 223 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
99 u64 blkno) 224 * ocfs2_extent_contig only work properly against leaf nodes!
225 */
226static int ocfs2_block_extent_contig(struct super_block *sb,
227 struct ocfs2_extent_rec *ext,
228 u64 blkno)
229{
230 u64 blk_end = le64_to_cpu(ext->e_blkno);
231
232 blk_end += ocfs2_clusters_to_blocks(sb,
233 le16_to_cpu(ext->e_leaf_clusters));
234
235 return blkno == blk_end;
236}
237
238static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
239 struct ocfs2_extent_rec *right)
240{
241 u32 left_range;
242
243 left_range = le32_to_cpu(left->e_cpos) +
244 le16_to_cpu(left->e_leaf_clusters);
245
246 return (left_range == le32_to_cpu(right->e_cpos));
247}
248
249static enum ocfs2_contig_type
250 ocfs2_extent_contig(struct inode *inode,
251 struct ocfs2_extent_rec *ext,
252 struct ocfs2_extent_rec *insert_rec)
100{ 253{
101 return blkno == (le64_to_cpu(ext->e_blkno) + 254 u64 blkno = le64_to_cpu(insert_rec->e_blkno);
102 ocfs2_clusters_to_blocks(inode->i_sb, 255
103 le32_to_cpu(ext->e_clusters))); 256 if (ocfs2_extents_adjacent(ext, insert_rec) &&
257 ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
258 return CONTIG_RIGHT;
259
260 blkno = le64_to_cpu(ext->e_blkno);
261 if (ocfs2_extents_adjacent(insert_rec, ext) &&
262 ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
263 return CONTIG_LEFT;
264
265 return CONTIG_NONE;
104} 266}
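
Two extents are only mergeable when their logical ranges are adjacent and their physical blocks line up, and testing the insert record on each side of an existing record yields CONTIG_RIGHT, CONTIG_LEFT or CONTIG_NONE. A host-side sketch of that classification, simplified to one block per cluster; all names are illustrative.

#include <stdio.h>

struct rec {
	unsigned cpos;			/* logical start, in clusters */
	unsigned clusters;
	unsigned long long blkno;	/* physical start */
};

enum contig { NONE, LEFT, RIGHT };

/* Does `right` start exactly where `left` ends, logically and physically? */
static int follows(const struct rec *left, const struct rec *right)
{
	return left->cpos + left->clusters == right->cpos &&
	       left->blkno + left->clusters == right->blkno;
}

static enum contig classify(const struct rec *ext, const struct rec *ins)
{
	if (follows(ext, ins))
		return RIGHT;		/* insert glues onto ext's right end */
	if (follows(ins, ext))
		return LEFT;		/* insert glues onto ext's left end */
	return NONE;
}

int main(void)
{
	struct rec ext = { 0, 4, 1000 }, ins = { 4, 2, 1004 };

	printf("contig: %d\n", classify(&ext, &ins));	/* prints 2: RIGHT */
	return 0;
}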
105 267
106/* 268/*
269 * NOTE: We can have pretty much any combination of contiguousness and
270 * appending.
271 *
272 * The usefulness of APPEND_TAIL is more in that it lets us know that
273 * we'll have to update the path to that leaf.
274 */
275enum ocfs2_append_type {
276 APPEND_NONE = 0,
277 APPEND_TAIL,
278};
279
280struct ocfs2_insert_type {
281 enum ocfs2_append_type ins_appending;
282 enum ocfs2_contig_type ins_contig;
283 int ins_contig_index;
284 int ins_free_records;
285 int ins_tree_depth;
286};
287
288/*
107 * How many free extents have we got before we need more meta data? 289 * How many free extents have we got before we need more meta data?
108 */ 290 */
109int ocfs2_num_free_extents(struct ocfs2_super *osb, 291int ocfs2_num_free_extents(struct ocfs2_super *osb,
@@ -242,6 +424,28 @@ bail:
242} 424}
243 425
244/* 426/*
427 * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
428 *
429 * Returns the sum of the rightmost extent rec logical offset and
430 * cluster count.
431 *
432 * ocfs2_add_branch() uses this to determine what logical cluster
433 * value should be populated into the leftmost new branch records.
434 *
435 * ocfs2_shift_tree_depth() uses this to determine the # clusters
436 * value for the new topmost tree record.
437 */
438static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
439{
440 int i;
441
442 i = le16_to_cpu(el->l_next_free_rec) - 1;
443
444 return le32_to_cpu(el->l_recs[i].e_cpos) +
445 ocfs2_rec_clusters(el, &el->l_recs[i]);
446}
447
448/*
245 * Add an entire tree branch to our inode. eb_bh is the extent block 449 * Add an entire tree branch to our inode. eb_bh is the extent block
246 * to start at, if we don't want to start the branch at the dinode 450 * to start at, if we don't want to start the branch at the dinode
247 * structure. 451 * structure.
@@ -250,7 +454,7 @@ bail:
250 * for the new last extent block. 454 * for the new last extent block.
251 * 455 *
252 * the new branch will be 'empty' in the sense that every block will 456 * the new branch will be 'empty' in the sense that every block will
253 * contain a single record with e_clusters == 0. 457 * contain a single record with cluster count == 0.
254 */ 458 */
255static int ocfs2_add_branch(struct ocfs2_super *osb, 459static int ocfs2_add_branch(struct ocfs2_super *osb,
256 handle_t *handle, 460 handle_t *handle,
@@ -268,6 +472,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
268 struct ocfs2_extent_block *eb; 472 struct ocfs2_extent_block *eb;
269 struct ocfs2_extent_list *eb_el; 473 struct ocfs2_extent_list *eb_el;
270 struct ocfs2_extent_list *el; 474 struct ocfs2_extent_list *el;
475 u32 new_cpos;
271 476
272 mlog_entry_void(); 477 mlog_entry_void();
273 478
@@ -302,6 +507,9 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
302 goto bail; 507 goto bail;
303 } 508 }
304 509
510 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
511 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
512
305 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 513 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
306 * linked with the rest of the tree. 514 * linked with the rest of the tree.
307 * conversely, new_eb_bhs[0] is the new bottommost leaf. 515 * conversely, new_eb_bhs[0] is the new bottommost leaf.
@@ -330,9 +538,18 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
330 eb->h_next_leaf_blk = 0; 538 eb->h_next_leaf_blk = 0;
331 eb_el->l_tree_depth = cpu_to_le16(i); 539 eb_el->l_tree_depth = cpu_to_le16(i);
332 eb_el->l_next_free_rec = cpu_to_le16(1); 540 eb_el->l_next_free_rec = cpu_to_le16(1);
333 eb_el->l_recs[0].e_cpos = fe->i_clusters; 541 /*
542 * This actually counts as an empty extent as
543 * c_clusters == 0
544 */
545 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
334 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); 546 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
335 eb_el->l_recs[0].e_clusters = cpu_to_le32(0); 547 /*
548 * eb_el isn't always an interior node, but even leaf
549 * nodes want a zero'd flags and reserved field so
550 * this gets the whole 32 bits regardless of use.
551 */
552 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
336 if (!eb_el->l_tree_depth) 553 if (!eb_el->l_tree_depth)
337 new_last_eb_blk = le64_to_cpu(eb->h_blkno); 554 new_last_eb_blk = le64_to_cpu(eb->h_blkno);
338 555
@@ -376,8 +593,8 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
376 * either be on the fe, or the extent block passed in. */ 593 * either be on the fe, or the extent block passed in. */
377 i = le16_to_cpu(el->l_next_free_rec); 594 i = le16_to_cpu(el->l_next_free_rec);
378 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); 595 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
379 el->l_recs[i].e_cpos = fe->i_clusters; 596 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
380 el->l_recs[i].e_clusters = 0; 597 el->l_recs[i].e_int_clusters = 0;
381 le16_add_cpu(&el->l_next_free_rec, 1); 598 le16_add_cpu(&el->l_next_free_rec, 1);
382 599
383 /* fe needs a new last extent block pointer, as does the 600 /* fe needs a new last extent block pointer, as does the
@@ -425,6 +642,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
425 struct buffer_head **ret_new_eb_bh) 642 struct buffer_head **ret_new_eb_bh)
426{ 643{
427 int status, i; 644 int status, i;
645 u32 new_clusters;
428 struct buffer_head *new_eb_bh = NULL; 646 struct buffer_head *new_eb_bh = NULL;
429 struct ocfs2_dinode *fe; 647 struct ocfs2_dinode *fe;
430 struct ocfs2_extent_block *eb; 648 struct ocfs2_extent_block *eb;
@@ -461,11 +679,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
461 /* copy the fe data into the new extent block */ 679 /* copy the fe data into the new extent block */
462 eb_el->l_tree_depth = fe_el->l_tree_depth; 680 eb_el->l_tree_depth = fe_el->l_tree_depth;
463 eb_el->l_next_free_rec = fe_el->l_next_free_rec; 681 eb_el->l_next_free_rec = fe_el->l_next_free_rec;
464 for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { 682 for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
465 eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos; 683 eb_el->l_recs[i] = fe_el->l_recs[i];
466 eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
467 eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
468 }
469 684
470 status = ocfs2_journal_dirty(handle, new_eb_bh); 685 status = ocfs2_journal_dirty(handle, new_eb_bh);
471 if (status < 0) { 686 if (status < 0) {
@@ -480,16 +695,15 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
480 goto bail; 695 goto bail;
481 } 696 }
482 697
698 new_clusters = ocfs2_sum_rightmost_rec(eb_el);
699
483 /* update fe now */ 700 /* update fe now */
484 le16_add_cpu(&fe_el->l_tree_depth, 1); 701 le16_add_cpu(&fe_el->l_tree_depth, 1);
485 fe_el->l_recs[0].e_cpos = 0; 702 fe_el->l_recs[0].e_cpos = 0;
486 fe_el->l_recs[0].e_blkno = eb->h_blkno; 703 fe_el->l_recs[0].e_blkno = eb->h_blkno;
487 fe_el->l_recs[0].e_clusters = fe->i_clusters; 704 fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
488 for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { 705 for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
489 fe_el->l_recs[i].e_cpos = 0; 706 memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
490 fe_el->l_recs[i].e_clusters = 0;
491 fe_el->l_recs[i].e_blkno = 0;
492 }
493 fe_el->l_next_free_rec = cpu_to_le16(1); 707 fe_el->l_next_free_rec = cpu_to_le16(1);
494 708
495 /* If this is our 1st tree depth shift, then last_eb_blk 709 /* If this is our 1st tree depth shift, then last_eb_blk
@@ -515,199 +729,6 @@ bail:
515} 729}
516 730
517/* 731/*
518 * Expects the tree to already have room in the rightmost leaf for the
519 * extent. Updates all the extent blocks (and the dinode) on the way
520 * down.
521 */
522static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
523 handle_t *handle,
524 struct inode *inode,
525 struct buffer_head *fe_bh,
526 u64 start_blk,
527 u32 new_clusters)
528{
529 int status, i, num_bhs = 0;
530 u64 next_blkno;
531 u16 next_free;
532 struct buffer_head **eb_bhs = NULL;
533 struct ocfs2_dinode *fe;
534 struct ocfs2_extent_block *eb;
535 struct ocfs2_extent_list *el;
536
537 mlog_entry_void();
538
539 status = ocfs2_journal_access(handle, inode, fe_bh,
540 OCFS2_JOURNAL_ACCESS_WRITE);
541 if (status < 0) {
542 mlog_errno(status);
543 goto bail;
544 }
545
546 fe = (struct ocfs2_dinode *) fe_bh->b_data;
547 el = &fe->id2.i_list;
548 if (el->l_tree_depth) {
549 /* This is another operation where we want to be
550 * careful about our tree updates. An error here means
551 * none of the previous changes we made should roll
552 * forward. As a result, we have to record the buffers
553 * for this part of the tree in an array and reserve a
554 * journal write to them before making any changes. */
555 num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
556 eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
557 GFP_KERNEL);
558 if (!eb_bhs) {
559 status = -ENOMEM;
560 mlog_errno(status);
561 goto bail;
562 }
563
564 i = 0;
565 while(el->l_tree_depth) {
566 next_free = le16_to_cpu(el->l_next_free_rec);
567 if (next_free == 0) {
568 ocfs2_error(inode->i_sb,
569 "Dinode %llu has a bad extent list",
570 (unsigned long long)OCFS2_I(inode)->ip_blkno);
571 status = -EIO;
572 goto bail;
573 }
574 next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
575
576 BUG_ON(i >= num_bhs);
577 status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
578 OCFS2_BH_CACHED, inode);
579 if (status < 0) {
580 mlog_errno(status);
581 goto bail;
582 }
583 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
584 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
585 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
586 eb);
587 status = -EIO;
588 goto bail;
589 }
590
591 status = ocfs2_journal_access(handle, inode, eb_bhs[i],
592 OCFS2_JOURNAL_ACCESS_WRITE);
593 if (status < 0) {
594 mlog_errno(status);
595 goto bail;
596 }
597
598 el = &eb->h_list;
599 i++;
600 /* When we leave this loop, eb_bhs[num_bhs - 1] will
601 * hold the bottom-most leaf extent block. */
602 }
603 BUG_ON(el->l_tree_depth);
604
605 el = &fe->id2.i_list;
606 /* If we have tree depth, then the fe update is
607 * trivial, and we want to switch el out for the
608 * bottom-most leaf in order to update it with the
609 * actual extent data below. */
610 next_free = le16_to_cpu(el->l_next_free_rec);
611 if (next_free == 0) {
612 ocfs2_error(inode->i_sb,
613 "Dinode %llu has a bad extent list",
614 (unsigned long long)OCFS2_I(inode)->ip_blkno);
615 status = -EIO;
616 goto bail;
617 }
618 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
619 new_clusters);
620 /* (num_bhs - 1) to avoid the leaf */
621 for(i = 0; i < (num_bhs - 1); i++) {
622 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
623 el = &eb->h_list;
624
625 /* finally, make our actual change to the
626 * intermediate extent blocks. */
627 next_free = le16_to_cpu(el->l_next_free_rec);
628 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
629 new_clusters);
630
631 status = ocfs2_journal_dirty(handle, eb_bhs[i]);
632 if (status < 0)
633 mlog_errno(status);
634 }
635 BUG_ON(i != (num_bhs - 1));
636 /* note that the leaf block wasn't touched in
637 * the loop above */
638 eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
639 el = &eb->h_list;
640 BUG_ON(el->l_tree_depth);
641 }
642
643 /* yay, we can finally add the actual extent now! */
644 i = le16_to_cpu(el->l_next_free_rec) - 1;
645 if (le16_to_cpu(el->l_next_free_rec) &&
646 ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
647 le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
648 } else if (le16_to_cpu(el->l_next_free_rec) &&
649 (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
650 /* having an empty extent at eof is legal. */
651 if (el->l_recs[i].e_cpos != fe->i_clusters) {
652 ocfs2_error(inode->i_sb,
653 "Dinode %llu trailing extent is bad: "
654 "cpos (%u) != number of clusters (%u)",
655 (unsigned long long)OCFS2_I(inode)->ip_blkno,
656 le32_to_cpu(el->l_recs[i].e_cpos),
657 le32_to_cpu(fe->i_clusters));
658 status = -EIO;
659 goto bail;
660 }
661 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
662 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
663 } else {
664 /* No contiguous record, or no empty record at eof, so
665 * we add a new one. */
666
667 BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
668 le16_to_cpu(el->l_count));
669 i = le16_to_cpu(el->l_next_free_rec);
670
671 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
672 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
673 el->l_recs[i].e_cpos = fe->i_clusters;
674 le16_add_cpu(&el->l_next_free_rec, 1);
675 }
676
677 /*
678 * extent_map errors are not fatal, so they are ignored outside
679 * of flushing the thing.
680 */
681 status = ocfs2_extent_map_append(inode, &el->l_recs[i],
682 new_clusters);
683 if (status) {
684 mlog_errno(status);
685 ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
686 }
687
688 status = ocfs2_journal_dirty(handle, fe_bh);
689 if (status < 0)
690 mlog_errno(status);
691 if (fe->id2.i_list.l_tree_depth) {
692 status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
693 if (status < 0)
694 mlog_errno(status);
695 }
696
697 status = 0;
698bail:
699 if (eb_bhs) {
700 for (i = 0; i < num_bhs; i++)
701 if (eb_bhs[i])
702 brelse(eb_bhs[i]);
703 kfree(eb_bhs);
704 }
705
706 mlog_exit(status);
707 return status;
708}
709
710/*
711 * Should only be called when there is no space left in any of the 732 * Should only be called when there is no space left in any of the
712 * leaf nodes. What we want to do is find the lowest tree depth 733 * leaf nodes. What we want to do is find the lowest tree depth
713 * non-leaf extent block with room for new records. There are three 734 * non-leaf extent block with room for new records. There are three
@@ -807,53 +828,1548 @@ bail:
807 return status; 828 return status;
808} 829}
809 830
810/* the caller needs to update fe->i_clusters */ 831/*
811int ocfs2_insert_extent(struct ocfs2_super *osb, 832 * This is only valid for leaf nodes, which are the only ones that can
812 handle_t *handle, 833 * have empty extents anyway.
813 struct inode *inode, 834 */
814 struct buffer_head *fe_bh, 835static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
815 u64 start_blk,
816 u32 new_clusters,
817 struct ocfs2_alloc_context *meta_ac)
818{ 836{
819 int status, i, shift; 837 return !rec->e_leaf_clusters;
820 struct buffer_head *last_eb_bh = NULL; 838}
839
840/*
841 * This function will discard the rightmost extent record.
842 */
843static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
844{
845 int next_free = le16_to_cpu(el->l_next_free_rec);
846 int count = le16_to_cpu(el->l_count);
847 unsigned int num_bytes;
848
849 BUG_ON(!next_free);
850 /* This will cause us to go off the end of our extent list. */
851 BUG_ON(next_free >= count);
852
853 num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
854
855 memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
856}
857
858static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
859 struct ocfs2_extent_rec *insert_rec)
860{
861 int i, insert_index, next_free, has_empty, num_bytes;
862 u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
863 struct ocfs2_extent_rec *rec;
864
865 next_free = le16_to_cpu(el->l_next_free_rec);
866 has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
867
868 BUG_ON(!next_free);
869
870 /* The tree code before us didn't allow enough room in the leaf. */
871 if (el->l_next_free_rec == el->l_count && !has_empty)
872 BUG();
873
874 /*
875 * The easiest way to approach this is to just remove the
876 * empty extent and temporarily decrement next_free.
877 */
878 if (has_empty) {
879 /*
880 * If next_free was 1 (only an empty extent), this
881 * loop won't execute, which is fine. We still want
882 * the decrement above to happen.
883 */
884 for(i = 0; i < (next_free - 1); i++)
885 el->l_recs[i] = el->l_recs[i+1];
886
887 next_free--;
888 }
889
890 /*
891 * Figure out what the new record index should be.
892 */
893 for(i = 0; i < next_free; i++) {
894 rec = &el->l_recs[i];
895
896 if (insert_cpos < le32_to_cpu(rec->e_cpos))
897 break;
898 }
899 insert_index = i;
900
901 mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
902 insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
903
904 BUG_ON(insert_index < 0);
905 BUG_ON(insert_index >= le16_to_cpu(el->l_count));
906 BUG_ON(insert_index > next_free);
907
908 /*
909 * No need to memmove if we're just adding to the tail.
910 */
911 if (insert_index != next_free) {
912 BUG_ON(next_free >= le16_to_cpu(el->l_count));
913
914 num_bytes = next_free - insert_index;
915 num_bytes *= sizeof(struct ocfs2_extent_rec);
916 memmove(&el->l_recs[insert_index + 1],
917 &el->l_recs[insert_index],
918 num_bytes);
919 }
920
921 /*
 922 * Either we had an empty extent and need to re-increment, or
 923 * there was no empty extent on a non-full rightmost leaf node,
 924 * in which case we still need to increment.
925 */
926 next_free++;
927 el->l_next_free_rec = cpu_to_le16(next_free);
928 /*
929 * Make sure none of the math above just messed up our tree.
930 */
931 BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
932
933 el->l_recs[insert_index] = *insert_rec;
934
935}
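
Stripped of the btree bookkeeping, ocfs2_rotate_leaf() is a sorted-array insert: consume the empty slot at index 0 if present, find the insertion index by cpos, memmove the tail right, and store the record. The core of that operation as a standalone sketch over plain integers.

#include <stdio.h>
#include <string.h>

/* Insert v into the sorted array a[0..n-1]; returns the new length. */
static int sorted_insert(int *a, int n, int cap, int v)
{
	int i;

	if (n >= cap)
		return n;		/* the caller must have made room */

	for (i = 0; i < n; i++)
		if (v < a[i])
			break;

	memmove(&a[i + 1], &a[i], (n - i) * sizeof(*a));
	a[i] = v;
	return n + 1;
}

int main(void)
{
	int a[8] = { 1, 3, 7 }, n = 3, i;

	n = sorted_insert(a, n, 8, 5);
	for (i = 0; i < n; i++)
		printf("%d ", a[i]);	/* 1 3 5 7 */
	putchar('\n');
	return 0;
}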
936
937/*
 938 * Create an empty extent record.
 939 *
 940 * l_next_free_rec may be updated.
 941 *
 942 * If an empty extent already exists, do nothing.
943 */
944static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
945{
946 int next_free = le16_to_cpu(el->l_next_free_rec);
947
948 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
949
950 if (next_free == 0)
951 goto set_and_inc;
952
953 if (ocfs2_is_empty_extent(&el->l_recs[0]))
954 return;
955
956 mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
957 "Asked to create an empty extent in a full list:\n"
958 "count = %u, tree depth = %u",
959 le16_to_cpu(el->l_count),
960 le16_to_cpu(el->l_tree_depth));
961
962 ocfs2_shift_records_right(el);
963
964set_and_inc:
965 le16_add_cpu(&el->l_next_free_rec, 1);
966 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
967}
968
969/*
970 * For a rotation which involves two leaf nodes, the "root node" is
971 * the lowest level tree node which contains a path to both leafs. This
972 * resulting set of information can be used to form a complete "subtree"
973 *
974 * This function is passed two full paths from the dinode down to a
 975 * pair of adjacent leaves. Its task is to figure out which path
976 * index contains the subtree root - this can be the root index itself
977 * in a worst-case rotation.
978 *
979 * The array index of the subtree root is passed back.
980 */
981static int ocfs2_find_subtree_root(struct inode *inode,
982 struct ocfs2_path *left,
983 struct ocfs2_path *right)
984{
985 int i = 0;
986
987 /*
988 * Check that the caller passed in two paths from the same tree.
989 */
990 BUG_ON(path_root_bh(left) != path_root_bh(right));
991
992 do {
993 i++;
994
995 /*
996 * The caller didn't pass two adjacent paths.
997 */
998 mlog_bug_on_msg(i > left->p_tree_depth,
999 "Inode %lu, left depth %u, right depth %u\n"
1000 "left leaf blk %llu, right leaf blk %llu\n",
1001 inode->i_ino, left->p_tree_depth,
1002 right->p_tree_depth,
1003 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1004 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1005 } while (left->p_node[i].bh->b_blocknr ==
1006 right->p_node[i].bh->b_blocknr);
1007
1008 return i - 1;
1009}
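
Because both paths begin at the same root, the subtree root is simply the last index at which the two paths still reference the same block. The sketch below shows that walk in isolation, with arrays of block numbers standing in for the path structures.

#include <stdio.h>

/* Paths share index 0 (the root); depth is the index of the leaf. */
static int subtree_root(const int *left, const int *right, int depth)
{
	int i = 0;

	do {
		i++;
	} while (i <= depth && left[i] == right[i]);

	return i - 1;			/* last index where they agreed */
}

int main(void)
{
	int left[]  = { 1, 2, 4 };	/* root=1, diverges at depth 2 */
	int right[] = { 1, 2, 5 };

	printf("subtree root index: %d\n", subtree_root(left, right, 2));
	return 0;
}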
1010
1011typedef void (path_insert_t)(void *, struct buffer_head *);
1012
1013/*
1014 * Traverse a btree path in search of cpos, starting at root_el.
1015 *
1016 * This code can be called with a cpos larger than the tree, in which
1017 * case it will return the rightmost path.
1018 */
1019static int __ocfs2_find_path(struct inode *inode,
1020 struct ocfs2_extent_list *root_el, u32 cpos,
1021 path_insert_t *func, void *data)
1022{
1023 int i, ret = 0;
1024 u32 range;
1025 u64 blkno;
821 struct buffer_head *bh = NULL; 1026 struct buffer_head *bh = NULL;
822 struct ocfs2_dinode *fe;
823 struct ocfs2_extent_block *eb; 1027 struct ocfs2_extent_block *eb;
824 struct ocfs2_extent_list *el; 1028 struct ocfs2_extent_list *el;
1029 struct ocfs2_extent_rec *rec;
1030 struct ocfs2_inode_info *oi = OCFS2_I(inode);
825 1031
826 mlog_entry_void(); 1032 el = root_el;
1033 while (el->l_tree_depth) {
1034 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1035 ocfs2_error(inode->i_sb,
1036 "Inode %llu has empty extent list at "
1037 "depth %u\n",
1038 (unsigned long long)oi->ip_blkno,
1039 le16_to_cpu(el->l_tree_depth));
1040 ret = -EROFS;
1041 goto out;
827 1042
828 mlog(0, "add %u clusters starting at block %llu to inode %llu\n", 1043 }
829 new_clusters, (unsigned long long)start_blk,
830 (unsigned long long)OCFS2_I(inode)->ip_blkno);
831 1044
832 fe = (struct ocfs2_dinode *) fe_bh->b_data; 1045 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
833 el = &fe->id2.i_list; 1046 rec = &el->l_recs[i];
1047
1048 /*
1049 * In the case that cpos is off the allocation
1050 * tree, this should just wind up returning the
1051 * rightmost record.
1052 */
1053 range = le32_to_cpu(rec->e_cpos) +
1054 ocfs2_rec_clusters(el, rec);
1055 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1056 break;
1057 }
834 1058
835 if (el->l_tree_depth) { 1059 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
836 /* jump to end of tree */ 1060 if (blkno == 0) {
837 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 1061 ocfs2_error(inode->i_sb,
838 &last_eb_bh, OCFS2_BH_CACHED, inode); 1062 "Inode %llu has bad blkno in extent list "
839 if (status < 0) { 1063 "at depth %u (index %d)\n",
840 mlog_exit(status); 1064 (unsigned long long)oi->ip_blkno,
841 goto bail; 1065 le16_to_cpu(el->l_tree_depth), i);
1066 ret = -EROFS;
1067 goto out;
842 } 1068 }
843 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 1069
1070 brelse(bh);
1071 bh = NULL;
1072 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
1073 &bh, OCFS2_BH_CACHED, inode);
1074 if (ret) {
1075 mlog_errno(ret);
1076 goto out;
1077 }
1078
1079 eb = (struct ocfs2_extent_block *) bh->b_data;
844 el = &eb->h_list; 1080 el = &eb->h_list;
1081 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1082 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1083 ret = -EIO;
1084 goto out;
1085 }
1086
1087 if (le16_to_cpu(el->l_next_free_rec) >
1088 le16_to_cpu(el->l_count)) {
1089 ocfs2_error(inode->i_sb,
1090 "Inode %llu has bad count in extent list "
1091 "at block %llu (next free=%u, count=%u)\n",
1092 (unsigned long long)oi->ip_blkno,
1093 (unsigned long long)bh->b_blocknr,
1094 le16_to_cpu(el->l_next_free_rec),
1095 le16_to_cpu(el->l_count));
1096 ret = -EROFS;
1097 goto out;
1098 }
1099
1100 if (func)
1101 func(data, bh);
1102 }
1103
1104out:
1105 /*
1106 * Catch any trailing bh that the loop didn't handle.
1107 */
1108 brelse(bh);
1109
1110 return ret;
1111}
1112
1113/*
1114 * Given an initialized path (that is, it has a valid root extent
1115 * list), this function will traverse the btree in search of the path
1116 * which would contain cpos.
1117 *
1118 * The path traveled is recorded in the path structure.
1119 *
1120 * Note that this will not do any comparisons on leaf node extent
1121 * records, so it will work fine in the case that we just added a tree
1122 * branch.
1123 */
1124struct find_path_data {
1125 int index;
1126 struct ocfs2_path *path;
1127};
1128static void find_path_ins(void *data, struct buffer_head *bh)
1129{
1130 struct find_path_data *fp = data;
1131
1132 get_bh(bh);
1133 ocfs2_path_insert_eb(fp->path, fp->index, bh);
1134 fp->index++;
1135}
1136static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1137 u32 cpos)
1138{
1139 struct find_path_data data;
1140
1141 data.index = 1;
1142 data.path = path;
1143 return __ocfs2_find_path(inode, path_root_el(path), cpos,
1144 find_path_ins, &data);
1145}
1146
1147static void find_leaf_ins(void *data, struct buffer_head *bh)
1148{
1149 struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1150 struct ocfs2_extent_list *el = &eb->h_list;
1151 struct buffer_head **ret = data;
1152
1153 /* We want to retain only the leaf block. */
1154 if (le16_to_cpu(el->l_tree_depth) == 0) {
1155 get_bh(bh);
1156 *ret = bh;
1157 }
1158}
1159/*
1160 * Find the leaf block in the tree which would contain cpos. No
1161 * checking of the actual leaf is done.
1162 *
1163 * Some paths want to call this instead of allocating a path structure
1164 * and calling ocfs2_find_path().
1165 *
1166 * This function doesn't handle non btree extent lists.
1167 */
1168int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1169 u32 cpos, struct buffer_head **leaf_bh)
1170{
1171 int ret;
1172 struct buffer_head *bh = NULL;
1173
1174 ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1175 if (ret) {
1176 mlog_errno(ret);
1177 goto out;
1178 }
1179
1180 *leaf_bh = bh;
1181out:
1182 return ret;
1183}
1184
1185/*
1186 * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1187 *
1188 * Basically, we've moved stuff around at the bottom of the tree and
1189 * we need to fix up the extent records above them to reflect
1190 * the new layout.
1191 *
1192 * left_rec: the record on the left.
1193 * left_child_el: is the child list pointed to by left_rec
1194 * right_rec: the record to the right of left_rec
1195 * right_child_el: is the child list pointed to by right_rec
1196 *
1197 * By definition, this only works on interior nodes.
1198 */
1199static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1200 struct ocfs2_extent_list *left_child_el,
1201 struct ocfs2_extent_rec *right_rec,
1202 struct ocfs2_extent_list *right_child_el)
1203{
1204 u32 left_clusters, right_end;
1205
1206 /*
1207 * Interior nodes never have holes. Their cpos is the cpos of
1208 * the leftmost record in their child list. Their cluster
1209 * count covers the full theoretical range of their child list
1210 * - the range between their cpos and the cpos of the record
1211 * immediately to their right.
1212 */
1213 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1214 left_clusters -= le32_to_cpu(left_rec->e_cpos);
1215 left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1216
1217 /*
1218 * Calculate the rightmost cluster count boundary before
1219 * moving cpos - we will need to adjust clusters after
1220 * updating e_cpos to keep the same highest cluster count.
1221 */
1222 right_end = le32_to_cpu(right_rec->e_cpos);
1223 right_end += le32_to_cpu(right_rec->e_int_clusters);
1224
1225 right_rec->e_cpos = left_rec->e_cpos;
1226 le32_add_cpu(&right_rec->e_cpos, left_clusters);
1227
1228 right_end -= le32_to_cpu(right_rec->e_cpos);
1229 right_rec->e_int_clusters = cpu_to_le32(right_end);
1230}
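
A worked example may help here: interior nodes have no holes, so the left record's cluster count becomes the distance from its cpos to the right child's new first cpos, and the right record's cpos moves up by the same amount while its end stays fixed. The arithmetic with invented numbers:

#include <stdio.h>

int main(void)
{
	/* Before: left covers [10, ?), right covers [30, 50). After the
	 * leaf rotation, the right child's first record starts at 25. */
	unsigned left_cpos = 10, right_cpos = 30, right_clusters = 20;
	unsigned right_child_first_cpos = 25;

	unsigned left_clusters = right_child_first_cpos - left_cpos;	/* 15 */
	unsigned right_end = right_cpos + right_clusters;		/* 50 */

	right_cpos = left_cpos + left_clusters;				/* 25 */
	right_clusters = right_end - right_cpos;			/* 25 */

	printf("left: [%u, %u)  right: [%u, %u)\n",
	       left_cpos, left_cpos + left_clusters,
	       right_cpos, right_cpos + right_clusters);
	return 0;
}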
1231
1232/*
1233 * Adjust the adjacent root node records involved in a
1234 * rotation. left_el_blkno is passed in as a key so that we can easily
1235 * find its index in the root list.
1236 */
1237static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1238 struct ocfs2_extent_list *left_el,
1239 struct ocfs2_extent_list *right_el,
1240 u64 left_el_blkno)
1241{
1242 int i;
1243
1244 BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1245 le16_to_cpu(left_el->l_tree_depth));
1246
1247 for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1248 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1249 break;
1250 }
1251
1252 /*
1253 * The path walking code should have never returned a root and
1254 * two paths which are not adjacent.
1255 */
1256 BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1257
1258 ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1259 &root_el->l_recs[i + 1], right_el);
1260}
1261
1262/*
1263 * We've changed a leaf block (in right_path) and need to reflect that
1264 * change back up the subtree.
1265 *
1266 * This happens in multiple places:
1267 * - When we've moved an extent record from the left path leaf to the right
1268 * path leaf to make room for an empty extent in the left path leaf.
1269 * - When our insert into the right path leaf is at the leftmost edge
1270 * and requires an update of the path immediately to its left. This
1271 * can occur at the end of some types of rotation and appending inserts.
1272 */
1273static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1274 struct ocfs2_path *left_path,
1275 struct ocfs2_path *right_path,
1276 int subtree_index)
1277{
1278 int ret, i, idx;
1279 struct ocfs2_extent_list *el, *left_el, *right_el;
1280 struct ocfs2_extent_rec *left_rec, *right_rec;
1281 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1282
1283 /*
1284 * Update the counts and position values within all the
1285 * interior nodes to reflect the leaf rotation we just did.
1286 *
1287 * The root node is handled below the loop.
1288 *
1289 * We begin the loop with right_el and left_el pointing to the
1290 * leaf lists and work our way up.
1291 *
1292 * NOTE: within this loop, left_el and right_el always refer
1293 * to the *child* lists.
1294 */
1295 left_el = path_leaf_el(left_path);
1296 right_el = path_leaf_el(right_path);
1297 for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1298 mlog(0, "Adjust records at index %u\n", i);
1299
1300 /*
1301 * One nice property of knowing that all of these
1302 * nodes are below the root is that we only deal with
1303 * the leftmost right node record and the rightmost
1304 * left node record.
1305 */
1306 el = left_path->p_node[i].el;
1307 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1308 left_rec = &el->l_recs[idx];
1309
1310 el = right_path->p_node[i].el;
1311 right_rec = &el->l_recs[0];
1312
1313 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1314 right_el);
1315
1316 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1317 if (ret)
1318 mlog_errno(ret);
1319
1320 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1321 if (ret)
1322 mlog_errno(ret);
1323
1324 /*
1325 * Setup our list pointers now so that the current
1326 * parents become children in the next iteration.
1327 */
1328 left_el = left_path->p_node[i].el;
1329 right_el = right_path->p_node[i].el;
1330 }
1331
1332 /*
1333 * At the root node, adjust the two adjacent records which
1334 * begin our path to the leaves.
1335 */
1336
1337 el = left_path->p_node[subtree_index].el;
1338 left_el = left_path->p_node[subtree_index + 1].el;
1339 right_el = right_path->p_node[subtree_index + 1].el;
1340
1341 ocfs2_adjust_root_records(el, left_el, right_el,
1342 left_path->p_node[subtree_index + 1].bh->b_blocknr);
1343
1344 root_bh = left_path->p_node[subtree_index].bh;
1345
1346 ret = ocfs2_journal_dirty(handle, root_bh);
1347 if (ret)
1348 mlog_errno(ret);
1349}
1350
1351static int ocfs2_rotate_subtree_right(struct inode *inode,
1352 handle_t *handle,
1353 struct ocfs2_path *left_path,
1354 struct ocfs2_path *right_path,
1355 int subtree_index)
1356{
1357 int ret, i;
1358 struct buffer_head *right_leaf_bh;
1359 struct buffer_head *left_leaf_bh = NULL;
1360 struct buffer_head *root_bh;
1361 struct ocfs2_extent_list *right_el, *left_el;
1362 struct ocfs2_extent_rec move_rec;
1363
1364 left_leaf_bh = path_leaf_bh(left_path);
1365 left_el = path_leaf_el(left_path);
1366
1367 if (left_el->l_next_free_rec != left_el->l_count) {
1368 ocfs2_error(inode->i_sb,
1369 "Inode %llu has non-full interior leaf node %llu"
1370 "(next free = %u)",
1371 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1372 (unsigned long long)left_leaf_bh->b_blocknr,
1373 le16_to_cpu(left_el->l_next_free_rec));
1374 return -EROFS;
1375 }
1376
1377 /*
1378 * This extent block may already have an empty record, so we
1379 * return early if so.
1380 */
1381 if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1382 return 0;
1383
1384 root_bh = left_path->p_node[subtree_index].bh;
1385 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1386
1387 ret = ocfs2_journal_access(handle, inode, root_bh,
1388 OCFS2_JOURNAL_ACCESS_WRITE);
1389 if (ret) {
1390 mlog_errno(ret);
1391 goto out;
1392 }
1393
1394 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1395 ret = ocfs2_journal_access(handle, inode,
1396 right_path->p_node[i].bh,
1397 OCFS2_JOURNAL_ACCESS_WRITE);
1398 if (ret) {
1399 mlog_errno(ret);
1400 goto out;
1401 }
1402
1403 ret = ocfs2_journal_access(handle, inode,
1404 left_path->p_node[i].bh,
1405 OCFS2_JOURNAL_ACCESS_WRITE);
1406 if (ret) {
1407 mlog_errno(ret);
1408 goto out;
1409 }
1410 }
1411
1412 right_leaf_bh = path_leaf_bh(right_path);
1413 right_el = path_leaf_el(right_path);
1414
1415 /* This is a code error, not a disk corruption. */
1416 mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1417 "because rightmost leaf block %llu is empty\n",
1418 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1419 (unsigned long long)right_leaf_bh->b_blocknr);
1420
1421 ocfs2_create_empty_extent(right_el);
1422
1423 ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1424 if (ret) {
1425 mlog_errno(ret);
1426 goto out;
1427 }
1428
1429 /* Do the copy now. */
1430 i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1431 move_rec = left_el->l_recs[i];
1432 right_el->l_recs[0] = move_rec;
1433
1434 /*
1435 * Clear out the record we just copied and shift everything
1436 * over, leaving an empty extent in the left leaf.
1437 *
1438 * We temporarily subtract from next_free_rec so that the
1439 * shift will lose the tail record (which is now defunct).
1440 */
1441 le16_add_cpu(&left_el->l_next_free_rec, -1);
1442 ocfs2_shift_records_right(left_el);
1443 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1444 le16_add_cpu(&left_el->l_next_free_rec, 1);
1445
1446 ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1447 if (ret) {
1448 mlog_errno(ret);
1449 goto out;
1450 }
1451
1452 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1453 subtree_index);
1454
1455out:
1456 return ret;
1457}
1458
1459/*
1460 * Given a full path, determine what cpos value would return us a path
1461 * containing the leaf immediately to the left of the current one.
1462 *
1463 * Will return zero if the path passed in is already the leftmost path.
1464 */
1465static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1466 struct ocfs2_path *path, u32 *cpos)
1467{
1468 int i, j, ret = 0;
1469 u64 blkno;
1470 struct ocfs2_extent_list *el;
1471
1472 BUG_ON(path->p_tree_depth == 0);
1473
1474 *cpos = 0;
1475
1476 blkno = path_leaf_bh(path)->b_blocknr;
1477
1478 /* Start at the tree node just above the leaf and work our way up. */
1479 i = path->p_tree_depth - 1;
1480 while (i >= 0) {
1481 el = path->p_node[i].el;
1482
1483 /*
1484 * Find the extent record just before the one in our
1485 * path.
1486 */
1487 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1488 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1489 if (j == 0) {
1490 if (i == 0) {
1491 /*
1492 * We've determined that the
1493 * path specified is already
1494 * the leftmost one - return a
1495 * cpos of zero.
1496 */
1497 goto out;
1498 }
1499 /*
1500 * The leftmost record points to our
1501 * leaf - we need to travel up the
1502 * tree one level.
1503 */
1504 goto next_node;
1505 }
1506
1507 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
1508 *cpos = *cpos + ocfs2_rec_clusters(el,
1509 &el->l_recs[j - 1]);
1510 *cpos = *cpos - 1;
1511 goto out;
1512 }
1513 }
1514
1515 /*
1516 * If we got here, we never found a valid node where
1517 * the tree indicated one should be.
1518 */
1519 ocfs2_error(sb,
1520 "Invalid extent tree at extent block %llu\n",
1521 (unsigned long long)blkno);
1522 ret = -EROFS;
1523 goto out;
1524
1525next_node:
1526 blkno = path->p_node[i].bh->b_blocknr;
1527 i--;
1528 }
1529
1530out:
1531 return ret;
1532}
1533
1534static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1535 struct ocfs2_path *path)
1536{
1537 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1;
1538
1539 if (handle->h_buffer_credits < credits)
1540 return ocfs2_extend_trans(handle, credits);
1541
1542 return 0;
1543}
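
The credit estimate above is worth unpacking: a rotation dirties every block strictly below the subtree root on both the left and the right path, hence the factor of two, plus one credit for the subtree root itself. A minimal sketch of just that arithmetic (the ocfs2 handle type is left out; the values are made up):

#include <assert.h>

/*
 * Illustrative only: mirrors the credit estimate in
 * ocfs2_extend_rotate_transaction(). Two paths are dirtied below the
 * subtree root, plus one credit for the subtree root block itself.
 */
static int rotate_credits(int tree_depth, int subtree_depth)
{
	return (tree_depth - subtree_depth) * 2 + 1;
}

int main(void)
{
	/* depth-3 tree, subtree root at depth 1: 2*2 + 1 = 5 blocks */
	assert(rotate_credits(3, 1) == 5);
	/* rotation rooted at the tree root of a depth-2 tree */
	assert(rotate_credits(2, 0) == 5);
	return 0;
}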
1544
1545/*
1546 * Trap the case where we're inserting into the theoretical range past
1547 * the _actual_ left leaf range. Otherwise, we'll rotate a record
1548 * whose cpos is less than ours into the right leaf.
1549 *
1550 * It's only necessary to look at the rightmost record of the left
1551 * leaf because the logic that calls us should ensure that the
1552 * theoretical ranges in the path components above the leaves are
1553 * correct.
1554 */
1555static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1556 u32 insert_cpos)
1557{
1558 struct ocfs2_extent_list *left_el;
1559 struct ocfs2_extent_rec *rec;
1560 int next_free;
1561
1562 left_el = path_leaf_el(left_path);
1563 next_free = le16_to_cpu(left_el->l_next_free_rec);
1564 rec = &left_el->l_recs[next_free - 1];
1565
1566 if (insert_cpos > le32_to_cpu(rec->e_cpos))
1567 return 1;
1568 return 0;
1569}
1570
1571/*
1572 * Rotate all the records in a btree right one record, starting at insert_cpos.
1573 *
1574 * The path to the rightmost leaf should be passed in.
1575 *
1576 * The array is assumed to be large enough to hold an entire path (tree depth).
1577 *
1578 * Upon successful return from this function:
1579 *
1580 * - The 'right_path' array will contain a path to the leaf block
1581 * whose range contains e_cpos.
1582 * - That leaf block will have a single empty extent in list index 0.
1583 * - In the case that the rotation requires a post-insert update,
1584 * *ret_left_path will contain a valid path which can be passed to
1585 * ocfs2_insert_path().
1586 */
1587static int ocfs2_rotate_tree_right(struct inode *inode,
1588 handle_t *handle,
1589 u32 insert_cpos,
1590 struct ocfs2_path *right_path,
1591 struct ocfs2_path **ret_left_path)
1592{
1593 int ret, start;
1594 u32 cpos;
1595 struct ocfs2_path *left_path = NULL;
1596
1597 *ret_left_path = NULL;
1598
1599 left_path = ocfs2_new_path(path_root_bh(right_path),
1600 path_root_el(right_path));
1601 if (!left_path) {
1602 ret = -ENOMEM;
1603 mlog_errno(ret);
1604 goto out;
1605 }
1606
1607 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
1608 if (ret) {
1609 mlog_errno(ret);
1610 goto out;
1611 }
1612
1613 mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
1614
1615 /*
1616 * What we want to do here is:
1617 *
1618 * 1) Start with the rightmost path.
1619 *
1620 * 2) Determine a path to the leaf block directly to the left
1621 * of that leaf.
1622 *
1623 * 3) Determine the 'subtree root' - the lowest level tree node
1624 * which contains a path to both leaves.
1625 *
1626 * 4) Rotate the subtree.
1627 *
1628 * 5) Find the next subtree by considering the left path to be
1629 * the new right path.
1630 *
1631 * The check at the top of this while loop also accepts
1632 * insert_cpos == cpos because cpos is only a _theoretical_
1633 * value to get us the left path - insert_cpos might very well
1634 * be filling that hole.
1635 *
1636 * Stop at a cpos of '0' because we either started at the
1637 * leftmost branch (i.e., a tree with one branch and a
1638 * rotation inside of it), or we've gone as far as we can in
1639 * rotating subtrees.
1640 */
1641 while (cpos && insert_cpos <= cpos) {
1642 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
1643 insert_cpos, cpos);
1644
1645 ret = ocfs2_find_path(inode, left_path, cpos);
1646 if (ret) {
1647 mlog_errno(ret);
1648 goto out;
1649 }
1650
1651 mlog_bug_on_msg(path_leaf_bh(left_path) ==
1652 path_leaf_bh(right_path),
1653 "Inode %lu: error during insert of %u "
1654 "(left path cpos %u) results in two identical "
1655 "paths ending at %llu\n",
1656 inode->i_ino, insert_cpos, cpos,
1657 (unsigned long long)
1658 path_leaf_bh(left_path)->b_blocknr);
1659
1660 if (ocfs2_rotate_requires_path_adjustment(left_path,
1661 insert_cpos)) {
1662 mlog(0, "Path adjustment required\n");
1663
1664 /*
1665 * We've rotated the tree as much as we
1666 * should. The rest is up to
1667 * ocfs2_insert_path() to complete, after the
1668 * record insertion. We indicate this
1669 * situation by returning the left path.
1670 *
1671 * The reason we don't adjust the records here
1672 * before the record insert is that an error
1673 * later might break the rule where a parent
1674 * record e_cpos will reflect the actual
1675 * e_cpos of the 1st nonempty record of the
1676 * child list.
1677 */
1678 *ret_left_path = left_path;
1679 goto out_ret_path;
1680 }
1681
1682 start = ocfs2_find_subtree_root(inode, left_path, right_path);
1683
1684 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
1685 start,
1686 (unsigned long long) right_path->p_node[start].bh->b_blocknr,
1687 right_path->p_tree_depth);
1688
1689 ret = ocfs2_extend_rotate_transaction(handle, start,
1690 right_path);
1691 if (ret) {
1692 mlog_errno(ret);
1693 goto out;
1694 }
1695
1696 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
1697 right_path, start);
1698 if (ret) {
1699 mlog_errno(ret);
1700 goto out;
1701 }
1702
1703 /*
1704 * There is no need to re-read the next right path
1705 * as we know that it'll be our current left
1706 * path. Optimize by copying values instead.
1707 */
1708 ocfs2_mv_path(right_path, left_path);
1709
1710 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1711 &cpos);
1712 if (ret) {
1713 mlog_errno(ret);
1714 goto out;
1715 }
1716 }
1717
1718out:
1719 ocfs2_free_path(left_path);
1720
1721out_ret_path:
1722 return ret;
1723}
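
For readers who want the loop shape in isolation, here is a toy userspace walk of the same right-to-left traversal. The leaf_start array and the "rotation" are stand-ins, not on-disk state; only the termination logic - keep going while the left neighbor's theoretical range still covers insert_cpos - mirrors the code above:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* hypothetical first-cpos of each leaf, left to right */
	uint32_t leaf_start[] = { 0, 120, 300, 450 };
	int right = 3;
	uint32_t insert_cpos = 150;

	while (right > 0) {
		/* last cluster the left neighbor could theoretically hold */
		uint32_t cpos = leaf_start[right] - 1;

		if (!(cpos && insert_cpos <= cpos))
			break;
		printf("rotate subtree between leaves %d and %d\n",
		       right - 1, right);
		right--;	/* left path becomes the new right path */
	}
	printf("record lands in leaf %d\n", right);
	return 0;
}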
1724
1725/*
1726 * Do the final bits of extent record insertion at the target leaf
1727 * list. If this leaf is part of an allocation tree, it is assumed
1728 * that the tree above has been prepared.
1729 */
1730static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1731 struct ocfs2_extent_list *el,
1732 struct ocfs2_insert_type *insert,
1733 struct inode *inode)
1734{
1735 int i = insert->ins_contig_index;
1736 unsigned int range;
1737 struct ocfs2_extent_rec *rec;
1738
1739 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1740
1741 /*
1742 * Contiguous insert - either left or right.
1743 */
1744 if (insert->ins_contig != CONTIG_NONE) {
1745 rec = &el->l_recs[i];
1746 if (insert->ins_contig == CONTIG_LEFT) {
1747 rec->e_blkno = insert_rec->e_blkno;
1748 rec->e_cpos = insert_rec->e_cpos;
1749 }
1750 le16_add_cpu(&rec->e_leaf_clusters,
1751 le16_to_cpu(insert_rec->e_leaf_clusters));
1752 return;
1753 }
1754
1755 /*
1756 * Handle insert into an empty leaf.
1757 */
1758 if (le16_to_cpu(el->l_next_free_rec) == 0 ||
1759 ((le16_to_cpu(el->l_next_free_rec) == 1) &&
1760 ocfs2_is_empty_extent(&el->l_recs[0]))) {
1761 el->l_recs[0] = *insert_rec;
1762 el->l_next_free_rec = cpu_to_le16(1);
1763 return;
1764 }
1765
1766 /*
1767 * Appending insert.
1768 */
1769 if (insert->ins_appending == APPEND_TAIL) {
1770 i = le16_to_cpu(el->l_next_free_rec) - 1;
1771 rec = &el->l_recs[i];
1772 range = le32_to_cpu(rec->e_cpos)
1773 + le16_to_cpu(rec->e_leaf_clusters);
1774 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
1775
1776 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
1777 le16_to_cpu(el->l_count),
1778 "inode %lu, depth %u, count %u, next free %u, "
1779 "rec.cpos %u, rec.clusters %u, "
1780 "insert.cpos %u, insert.clusters %u\n",
1781 inode->i_ino,
1782 le16_to_cpu(el->l_tree_depth),
1783 le16_to_cpu(el->l_count),
1784 le16_to_cpu(el->l_next_free_rec),
1785 le32_to_cpu(el->l_recs[i].e_cpos),
1786 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
1787 le32_to_cpu(insert_rec->e_cpos),
1788 le16_to_cpu(insert_rec->e_leaf_clusters));
1789 i++;
1790 el->l_recs[i] = *insert_rec;
1791 le16_add_cpu(&el->l_next_free_rec, 1);
1792 return;
1793 }
1794
1795 /*
1796 * Ok, we have to rotate.
1797 *
1798 * At this point, it is safe to assume that inserting into an
1799 * empty leaf and appending to a leaf have both been handled
1800 * above.
1801 *
1802 * This leaf needs to have space, either by the empty 1st
1803 * extent record, or by virtue of l_next_free_rec < l_count.
1804 */
1805 ocfs2_rotate_leaf(el, insert_rec);
1806}
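
A compact userspace model of the easy cases handled above may also help. The in-leaf rotation case is deliberately omitted, and the types are simplified CPU-endian stand-ins rather than the on-disk structures:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define TOY_COUNT 4

/* Hypothetical flat model of a leaf extent list. */
struct toy_rec { uint32_t cpos, clusters; };
struct toy_leaf { uint16_t next_free; struct toy_rec recs[TOY_COUNT]; };

/*
 * Mirrors three cases of ocfs2_insert_at_leaf(): fill an empty list,
 * merge into a contiguous tail, or append a new record at the tail.
 */
static void toy_insert(struct toy_leaf *el, struct toy_rec *ins)
{
	struct toy_rec *tail;

	if (el->next_free == 0) {		/* empty leaf */
		el->recs[0] = *ins;
		el->next_free = 1;
		return;
	}

	tail = &el->recs[el->next_free - 1];
	if (tail->cpos + tail->clusters == ins->cpos) {	/* right-contig */
		tail->clusters += ins->clusters;
		return;
	}

	el->recs[el->next_free++] = *ins;	/* plain tail append */
}

int main(void)
{
	struct toy_leaf el;
	struct toy_rec a = { 0, 10 }, b = { 10, 5 }, c = { 20, 2 };

	memset(&el, 0, sizeof(el));
	toy_insert(&el, &a);	/* empty-leaf case */
	toy_insert(&el, &b);	/* merges with a: (0, 15) */
	toy_insert(&el, &c);	/* appended as a new record */
	printf("%u records, first covers %u clusters\n",
	       el.next_free, el.recs[0].clusters);
	return 0;
}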
1807
1808static inline void ocfs2_update_dinode_clusters(struct inode *inode,
1809 struct ocfs2_dinode *di,
1810 u32 clusters)
1811{
1812 le32_add_cpu(&di->i_clusters, clusters);
1813 spin_lock(&OCFS2_I(inode)->ip_lock);
1814 OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
1815 spin_unlock(&OCFS2_I(inode)->ip_lock);
1816}
1817
1818static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1819 struct ocfs2_extent_rec *insert_rec,
1820 struct ocfs2_path *right_path,
1821 struct ocfs2_path **ret_left_path)
1822{
1823 int ret, i, next_free;
1824 struct buffer_head *bh;
1825 struct ocfs2_extent_list *el;
1826 struct ocfs2_path *left_path = NULL;
1827
1828 *ret_left_path = NULL;
1829
1830 /*
1831 * This shouldn't happen for non-trees. The extent rec cluster
1832 * count manipulation below only works for interior nodes.
1833 */
1834 BUG_ON(right_path->p_tree_depth == 0);
1835
1836 /*
1837 * If our appending insert is at the leftmost edge of a leaf,
1838 * then we might need to update the rightmost records of the
1839 * neighboring path.
1840 */
1841 el = path_leaf_el(right_path);
1842 next_free = le16_to_cpu(el->l_next_free_rec);
1843 if (next_free == 0 ||
1844 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
1845 u32 left_cpos;
1846
1847 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1848 &left_cpos);
1849 if (ret) {
1850 mlog_errno(ret);
1851 goto out;
1852 }
1853
1854 mlog(0, "Append may need a left path update. cpos: %u, "
1855 "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
1856 left_cpos);
1857
1858 /*
1859 * No need to worry if the append is already in the
1860 * leftmost leaf.
1861 */
1862 if (left_cpos) {
1863 left_path = ocfs2_new_path(path_root_bh(right_path),
1864 path_root_el(right_path));
1865 if (!left_path) {
1866 ret = -ENOMEM;
1867 mlog_errno(ret);
1868 goto out;
1869 }
1870
1871 ret = ocfs2_find_path(inode, left_path, left_cpos);
1872 if (ret) {
1873 mlog_errno(ret);
1874 goto out;
1875 }
1876
1877 /*
1878 * ocfs2_insert_path() will pass the left_path to the
1879 * journal for us.
1880 */
1881 }
1882 }
1883
1884 ret = ocfs2_journal_access_path(inode, handle, right_path);
1885 if (ret) {
1886 mlog_errno(ret);
1887 goto out;
1888 }
1889
1890 el = path_root_el(right_path);
1891 bh = path_root_bh(right_path);
1892 i = 0;
1893 while (1) {
1894 struct ocfs2_extent_rec *rec;
1895
1896 next_free = le16_to_cpu(el->l_next_free_rec);
1897 if (next_free == 0) {
1898 ocfs2_error(inode->i_sb,
1899 "Dinode %llu has a bad extent list",
1900 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1901 ret = -EIO;
1902 goto out;
1903 }
1904
1905 rec = &el->l_recs[next_free - 1];
1906
1907 rec->e_int_clusters = insert_rec->e_cpos;
1908 le32_add_cpu(&rec->e_int_clusters,
1909 le16_to_cpu(insert_rec->e_leaf_clusters));
1910 le32_add_cpu(&rec->e_int_clusters,
1911 -le32_to_cpu(rec->e_cpos));
1912
1913 ret = ocfs2_journal_dirty(handle, bh);
1914 if (ret)
1915 mlog_errno(ret);
1916
1917 /* Don't touch the leaf node */
1918 if (++i >= right_path->p_tree_depth)
1919 break;
1920
1921 bh = right_path->p_node[i].bh;
1922 el = right_path->p_node[i].el;
1923 }
1924
1925 *ret_left_path = left_path;
1926 ret = 0;
1927out:
1928 if (ret != 0)
1929 ocfs2_free_path(left_path);
1930
1931 return ret;
1932}
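
The interior update in the loop above reduces to one line of arithmetic: after a tail append, the rightmost record of every node on the path must cover up to the end of the new extent. A sketch with hypothetical values:

#include <stdint.h>
#include <assert.h>

/*
 * Illustrative restatement of the e_int_clusters update: the new
 * interior span runs from the record's own cpos to the end of the
 * appended extent.
 */
static uint32_t new_int_clusters(uint32_t rec_cpos, uint32_t insert_cpos,
				 uint16_t insert_clusters)
{
	return insert_cpos + insert_clusters - rec_cpos;
}

int main(void)
{
	/* interior record starts at cpos 100; we append (250, 8) below it */
	assert(new_int_clusters(100, 250, 8) == 158);
	return 0;
}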
1933
1934/*
1935 * This function only does inserts on an allocation b-tree. For dinode
1936 * lists, ocfs2_insert_at_leaf() is called directly.
1937 *
1938 * right_path is the path we want to do the actual insert
1939 * in. left_path should only be passed in if we need to update that
1940 * portion of the tree after an edge insert.
1941 */
1942static int ocfs2_insert_path(struct inode *inode,
1943 handle_t *handle,
1944 struct ocfs2_path *left_path,
1945 struct ocfs2_path *right_path,
1946 struct ocfs2_extent_rec *insert_rec,
1947 struct ocfs2_insert_type *insert)
1948{
1949 int ret, subtree_index;
1950 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
1951 struct ocfs2_extent_list *el;
1952
1953 /*
1954 * Pass both paths to the journal. The majority of inserts
1955 * will be touching all components anyway.
1956 */
1957 ret = ocfs2_journal_access_path(inode, handle, right_path);
1958 if (ret < 0) {
1959 mlog_errno(ret);
1960 goto out;
1961 }
1962
1963 if (left_path) {
1964 int credits = handle->h_buffer_credits;
1965
1966 /*
1967 * There's a chance that left_path got passed back to
1968 * us without being accounted for in the
1969 * journal. Extend our transaction here to be sure we
1970 * can change those blocks.
1971 */
1972 credits += left_path->p_tree_depth;
1973
1974 ret = ocfs2_extend_trans(handle, credits);
1975 if (ret < 0) {
1976 mlog_errno(ret);
1977 goto out;
1978 }
1979
1980 ret = ocfs2_journal_access_path(inode, handle, left_path);
1981 if (ret < 0) {
1982 mlog_errno(ret);
1983 goto out;
1984 }
1985 }
1986
1987 el = path_leaf_el(right_path);
1988
1989 ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
1990 ret = ocfs2_journal_dirty(handle, leaf_bh);
1991 if (ret)
1992 mlog_errno(ret);
1993
1994 if (left_path) {
1995 /*
1996 * The rotate code has indicated that we need to fix
1997 * up portions of the tree after the insert.
1998 *
1999 * XXX: Should we extend the transaction here?
2000 */
2001 subtree_index = ocfs2_find_subtree_root(inode, left_path,
2002 right_path);
2003 ocfs2_complete_edge_insert(inode, handle, left_path,
2004 right_path, subtree_index);
2005 }
2006
2007 ret = 0;
2008out:
2009 return ret;
2010}
2011
2012static int ocfs2_do_insert_extent(struct inode *inode,
2013 handle_t *handle,
2014 struct buffer_head *di_bh,
2015 struct ocfs2_extent_rec *insert_rec,
2016 struct ocfs2_insert_type *type)
2017{
2018 int ret, rotate = 0;
2019 u32 cpos;
2020 struct ocfs2_path *right_path = NULL;
2021 struct ocfs2_path *left_path = NULL;
2022 struct ocfs2_dinode *di;
2023 struct ocfs2_extent_list *el;
2024
2025 di = (struct ocfs2_dinode *) di_bh->b_data;
2026 el = &di->id2.i_list;
2027
2028 ret = ocfs2_journal_access(handle, inode, di_bh,
2029 OCFS2_JOURNAL_ACCESS_WRITE);
2030 if (ret) {
2031 mlog_errno(ret);
2032 goto out;
2033 }
2034
2035 if (le16_to_cpu(el->l_tree_depth) == 0) {
2036 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
2037 goto out_update_clusters;
2038 }
2039
2040 right_path = ocfs2_new_inode_path(di_bh);
2041 if (!right_path) {
2042 ret = -ENOMEM;
2043 mlog_errno(ret);
2044 goto out;
2045 }
2046
2047 /*
2048 * Determine the path to start with. Rotations need the
2049 * rightmost path, everything else can go directly to the
2050 * target leaf.
2051 */
2052 cpos = le32_to_cpu(insert_rec->e_cpos);
2053 if (type->ins_appending == APPEND_NONE &&
2054 type->ins_contig == CONTIG_NONE) {
2055 rotate = 1;
2056 cpos = UINT_MAX;
2057 }
2058
2059 ret = ocfs2_find_path(inode, right_path, cpos);
2060 if (ret) {
2061 mlog_errno(ret);
2062 goto out;
2063 }
2064
2065 /*
2066 * Rotations and appends need special treatment - they modify
2067 * parts of the tree above them.
2068 *
2069 * Both might pass back a path immediately to the left of the
2070 * one being inserted to. This will cause
2071 * ocfs2_insert_path() to modify the rightmost records of
2072 * left_path to account for an edge insert.
2073 *
2074 * XXX: When modifying this code, keep in mind that an insert
2075 * can wind up skipping both of these two special cases...
2076 */
2077 if (rotate) {
2078 ret = ocfs2_rotate_tree_right(inode, handle,
2079 le32_to_cpu(insert_rec->e_cpos),
2080 right_path, &left_path);
2081 if (ret) {
2082 mlog_errno(ret);
2083 goto out;
2084 }
2085 } else if (type->ins_appending == APPEND_TAIL
2086 && type->ins_contig != CONTIG_LEFT) {
2087 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
2088 right_path, &left_path);
2089 if (ret) {
2090 mlog_errno(ret);
2091 goto out;
2092 }
2093 }
2094
2095 ret = ocfs2_insert_path(inode, handle, left_path, right_path,
2096 insert_rec, type);
2097 if (ret) {
2098 mlog_errno(ret);
2099 goto out;
2100 }
2101
2102out_update_clusters:
2103 ocfs2_update_dinode_clusters(inode, di,
2104 le16_to_cpu(insert_rec->e_leaf_clusters));
2105
2106 ret = ocfs2_journal_dirty(handle, di_bh);
2107 if (ret)
2108 mlog_errno(ret);
2109
2110out:
2111 ocfs2_free_path(left_path);
2112 ocfs2_free_path(right_path);
2113
2114 return ret;
2115}
2116
2117static void ocfs2_figure_contig_type(struct inode *inode,
2118 struct ocfs2_insert_type *insert,
2119 struct ocfs2_extent_list *el,
2120 struct ocfs2_extent_rec *insert_rec)
2121{
2122 int i;
2123 enum ocfs2_contig_type contig_type = CONTIG_NONE;
2124
2125 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
2126
2127 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2128 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
2129 insert_rec);
2130 if (contig_type != CONTIG_NONE) {
2131 insert->ins_contig_index = i;
2132 break;
2133 }
2134 }
2135 insert->ins_contig = contig_type;
2136}
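
ocfs2_extent_contig() itself is not part of this hunk; the sketch below captures the test it is understood to perform - logical and physical adjacency must both hold - using simplified types and an assumed blocks-per-cluster factor:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical CPU-endian extent: logical start, physical start, length. */
struct toy_rec { uint32_t cpos; uint64_t blkno; uint16_t clusters; };

enum contig { CONTIG_NONE, CONTIG_LEFT, CONTIG_RIGHT };

/*
 * Two extents can merge only if they are adjacent both logically
 * (cpos) and physically (blkno), assuming bpc disk blocks per cluster.
 */
static enum contig extent_contig(struct toy_rec *ext, struct toy_rec *ins,
				 unsigned int bpc)
{
	if (ext->cpos + ext->clusters == ins->cpos &&
	    ext->blkno + (uint64_t)ext->clusters * bpc == ins->blkno)
		return CONTIG_RIGHT;	/* ins glues onto ext's right edge */
	if (ins->cpos + ins->clusters == ext->cpos &&
	    ins->blkno + (uint64_t)ins->clusters * bpc == ext->blkno)
		return CONTIG_LEFT;	/* ins glues onto ext's left edge */
	return CONTIG_NONE;
}

int main(void)
{
	struct toy_rec ext = { 0, 1000, 4 }, ins = { 4, 1032, 2 };

	/* 8 blocks per cluster: 1000 + 4*8 == 1032, so right-contiguous */
	printf("contig = %d\n", extent_contig(&ext, &ins, 8));
	return 0;
}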
2137
2138/*
2139 * This should only be called against the rightmost leaf extent list.
2140 *
2141 * ocfs2_figure_appending_type() will figure out whether we'll have to
2142 * insert at the tail of the rightmost leaf.
2143 *
2144 * This should also work against the dinode list for trees with 0
2145 * depth. If we consider the dinode list to be the rightmost leaf node,
2146 * then the logic here makes sense.
2147 */
2148static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
2149 struct ocfs2_extent_list *el,
2150 struct ocfs2_extent_rec *insert_rec)
2151{
2152 int i;
2153 u32 cpos = le32_to_cpu(insert_rec->e_cpos);
2154 struct ocfs2_extent_rec *rec;
2155
2156 insert->ins_appending = APPEND_NONE;
2157
2158 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
2159
2160 if (!el->l_next_free_rec)
2161 goto set_tail_append;
2162
2163 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
2164 /* Were all records empty? */
2165 if (le16_to_cpu(el->l_next_free_rec) == 1)
2166 goto set_tail_append;
  }

- /* Can we allocate without adding/shifting tree bits? */
  i = le16_to_cpu(el->l_next_free_rec) - 1;
- if (le16_to_cpu(el->l_next_free_rec) == 0
- || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
- || le32_to_cpu(el->l_recs[i].e_clusters) == 0
- || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
- goto out_add;
+ rec = &el->l_recs[i];
+
+ if (cpos >=
+ (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
+ goto set_tail_append;
+
+ return;
+
+ set_tail_append:
+ insert->ins_appending = APPEND_TAIL;
+ }
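
Restated in isolation, the append decision is a single comparison against the end of the last record. A toy version (CPU-endian, hypothetical values):

#include <stdint.h>
#include <stdio.h>

struct toy_rec { uint32_t cpos; uint16_t clusters; };

/*
 * An insert is a tail append when its cpos lies at or past the end of
 * the current last record, or when the list holds nothing at all.
 */
static int is_tail_append(struct toy_rec *recs, int next_free,
			  uint32_t insert_cpos)
{
	struct toy_rec *last;

	if (next_free == 0)
		return 1;
	last = &recs[next_free - 1];
	return insert_cpos >= last->cpos + last->clusters;
}

int main(void)
{
	struct toy_rec recs[] = { { 0, 10 }, { 10, 20 } };

	printf("%d\n", is_tail_append(recs, 2, 30)); /* 1: at the edge */
	printf("%d\n", is_tail_append(recs, 2, 15)); /* 0: lands inside */
	return 0;
}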
2181
2182/*
2183 * Helper function called at the beginning of an insert.
2184 *
2185 * This computes a few things that are commonly used in the process of
2186 * inserting into the btree:
2187 * - Whether the new extent is contiguous with an existing one.
2188 * - The current tree depth.
2189 * - Whether the insert is an appending one.
2190 * - The total # of free records in the tree.
2191 *
2192 * All of the information is stored on the ocfs2_insert_type
2193 * structure.
2194 */
2195static int ocfs2_figure_insert_type(struct inode *inode,
2196 struct buffer_head *di_bh,
2197 struct buffer_head **last_eb_bh,
2198 struct ocfs2_extent_rec *insert_rec,
2199 struct ocfs2_insert_type *insert)
2200{
2201 int ret;
2202 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2203 struct ocfs2_extent_block *eb;
2204 struct ocfs2_extent_list *el;
2205 struct ocfs2_path *path = NULL;
2206 struct buffer_head *bh = NULL;
2207
2208 el = &di->id2.i_list;
2209 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
2210
2211 if (el->l_tree_depth) {
2212 /*
2213 * If we have tree depth, we read in the
2214 * rightmost extent block ahead of time as
2215 * ocfs2_figure_insert_type() and ocfs2_add_branch()
2216 * may want it later.
2217 */
2218 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
2219 le64_to_cpu(di->i_last_eb_blk), &bh,
2220 OCFS2_BH_CACHED, inode);
2221 if (ret) {
2222 mlog_exit(ret);
2223 goto out;
2224 }
2225 eb = (struct ocfs2_extent_block *) bh->b_data;
2226 el = &eb->h_list;
2227 }
2228
2229 /*
2230 * Unless we have a contiguous insert, we'll need to know if
2231 * there is room left in our allocation tree for another
2232 * extent record.
2233 *
2234 * XXX: This test is simplistic, we can search for empty
2235 * extent records too.
2236 */
2237 insert->ins_free_records = le16_to_cpu(el->l_count) -
2238 le16_to_cpu(el->l_next_free_rec);
2239
2240 if (!insert->ins_tree_depth) {
2241 ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2242 ocfs2_figure_appending_type(insert, el, insert_rec);
2243 return 0;
2244 }
2245
2246 path = ocfs2_new_inode_path(di_bh);
2247 if (!path) {
2248 ret = -ENOMEM;
2249 mlog_errno(ret);
2250 goto out;
2251 }
2252
2253 /*
2254 * In the case that we're inserting past what the tree
2255 * currently accounts for, ocfs2_find_path() will return for
2256 * us the rightmost tree path. This is accounted for below in
2257 * the appending code.
2258 */
2259 ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
2260 if (ret) {
2261 mlog_errno(ret);
2262 goto out;
2263 }
2264
2265 el = path_leaf_el(path);
2266
2267 /*
2268 * Now that we have the path, there are two things we want to determine:
2269 * 1) Contiguousness (also set contig_index if this is so)
2270 *
2271 * 2) Are we doing an append? We can trivially break this up
2272 * into two types of appends: simple record append, or a
2273 * rotate inside the tail leaf.
2274 */
2275 ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2276
2277 /*
2278 * The insert code isn't quite ready to deal with all cases of
2279 * left contiguousness. Specifically, if it's an insert into
2280 * the 1st record in a leaf, it will require the adjustment of
2281 * cluster count on the last record of the path directly to its
2282 * left. For now, just catch that case and fool the layers
2283 * above us. This works just fine for tree_depth == 0, which
2284 * is why we allow that above.
2285 */
2286 if (insert->ins_contig == CONTIG_LEFT &&
2287 insert->ins_contig_index == 0)
2288 insert->ins_contig = CONTIG_NONE;
2289
2290 /*
2291 * Ok, so we can simply compare against last_eb to figure out
2292 * whether the path doesn't exist. This will only happen in
2293 * the case that we're doing a tail append, so maybe we can
2294 * take advantage of that information somehow.
2295 */
2296 if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
2297 /*
2298 * Ok, ocfs2_find_path() returned us the rightmost
2299 * tree path. This might be an appending insert. There are
2300 * two cases:
2301 * 1) We're doing a true append at the tail:
2302 * -This might even be off the end of the leaf
2303 * 2) We're "appending" by rotating in the tail
2304 */
2305 ocfs2_figure_appending_type(insert, el, insert_rec);
2306 }
2307
2308out:
2309 ocfs2_free_path(path);
2310
2311 if (ret == 0)
2312 *last_eb_bh = bh;
2313 else
2314 brelse(bh);
2315 return ret;
2316}
2317
2318/*
2319 * Insert an extent into an inode btree.
2320 *
2321 * The caller needs to update fe->i_clusters
2322 */
2323int ocfs2_insert_extent(struct ocfs2_super *osb,
2324 handle_t *handle,
2325 struct inode *inode,
2326 struct buffer_head *fe_bh,
2327 u32 cpos,
2328 u64 start_blk,
2329 u32 new_clusters,
2330 struct ocfs2_alloc_context *meta_ac)
2331{
2332 int status, shift;
2333 struct buffer_head *last_eb_bh = NULL;
2334 struct buffer_head *bh = NULL;
2335 struct ocfs2_insert_type insert = {0, };
2336 struct ocfs2_extent_rec rec;
2337
2338 mlog(0, "add %u clusters at position %u to inode %llu\n",
2339 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
2340
2341 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
2342 (OCFS2_I(inode)->ip_clusters != cpos),
2343 "Device %s, asking for sparse allocation: inode %llu, "
2344 "cpos %u, clusters %u\n",
2345 osb->dev_str,
2346 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
2347 OCFS2_I(inode)->ip_clusters);
2348
2349 memset(&rec, 0, sizeof(rec));
2350 rec.e_cpos = cpu_to_le32(cpos);
2351 rec.e_blkno = cpu_to_le64(start_blk);
2352 rec.e_leaf_clusters = cpu_to_le16(new_clusters);
2353
2354 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
2355 &insert);
2356 if (status < 0) {
2357 mlog_errno(status);
2358 goto bail;
2359 }

- mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
- "tree now.\n");
+ mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
+ "Insert.contig_index: %d, Insert.free_records: %d, "
+ "Insert.tree_depth: %d\n",
+ insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
+ insert.ins_free_records, insert.ins_tree_depth);
+
+ /*
+ * Avoid growing the tree unless we're out of records and the
+ * insert type requires one.
+ */
+ if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
+ goto out_add;

  shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
  if (shift < 0) {
@@ -866,13 +2382,9 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
  * and didn't find room for any more extents - we need to add
  * another tree level */
  if (shift) {
- /* if we hit a leaf, we'd better be empty :) */
- BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
- le16_to_cpu(el->l_count));
  BUG_ON(bh);
- mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
- "(current = %u)\n",
- le16_to_cpu(fe->id2.i_list.l_tree_depth));
+ mlog(0, "need to shift tree depth "
+ "(current = %d)\n", insert.ins_tree_depth);

  /* ocfs2_shift_tree_depth will return us a buffer with
  * the new extent block (so we can pass that to
@@ -883,15 +2395,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
  mlog_errno(status);
  goto bail;
  }
+ insert.ins_tree_depth++;
  /* Special case: we have room now if we shifted from
  * tree_depth 0 */
- if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
+ if (insert.ins_tree_depth == 1)
  goto out_add;
  }

  /* call ocfs2_add_branch to add the final part of the tree with
  * the new data. */
- mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh);
+ mlog(0, "add branch. bh = %p\n", bh);
  status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
  meta_ac);
  if (status < 0) {
@@ -900,11 +2413,12 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
  }

  out_add:
- /* Finally, we can add clusters. */
- status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
- start_blk, new_clusters);
+ /* Finally, we can add clusters. This might rotate the tree for us. */
+ status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
  if (status < 0)
  mlog_errno(status);
+ else
+ ocfs2_extent_map_insert_rec(inode, &rec);

  bail:
  if (bh)
@@ -1447,168 +2961,389 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
  * block will be deleted, and if it will, what the new last extent
  * block will be so we can update his h_next_leaf_blk field, as well
  * as the dinodes i_last_eb_blk */
- static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
- struct inode *inode,
- struct ocfs2_dinode *fe,
- u32 new_i_clusters,
- struct buffer_head *old_last_eb,
+ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
+ unsigned int clusters_to_del,
+ struct ocfs2_path *path,
  struct buffer_head **new_last_eb)
  {
- int i, status = 0;
- u64 block = 0;
+ int next_free, ret = 0;
+ u32 cpos;
+ struct ocfs2_extent_rec *rec;
  struct ocfs2_extent_block *eb;
  struct ocfs2_extent_list *el;
  struct buffer_head *bh = NULL;

  *new_last_eb = NULL;

- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
- status = -EIO;
- goto bail;
- }
-
  /* we have no tree, so of course, no last_eb. */
- if (!fe->id2.i_list.l_tree_depth)
- goto bail;
+ if (!path->p_tree_depth)
+ goto out;

  /* trunc to zero special case - this makes tree_depth = 0
  * regardless of what it is. */
- if (!new_i_clusters)
- goto bail;
+ if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
+ goto out;

- eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
- el = &(eb->h_list);
+ el = path_leaf_el(path);
  BUG_ON(!el->l_next_free_rec);

- /* Make sure that this guy will actually be empty after we
- * clear away the data. */
- if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
- goto bail;
+ /*
+ * Make sure that this extent list will actually be empty
+ * after we clear away the data. We can shortcut out if
+ * there's more than one non-empty extent in the
+ * list. Otherwise, a check of the remaining extent is
+ * necessary.
+ */
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ rec = NULL;
+ if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+ if (next_free > 2)
+ goto out;

- /* Ok, at this point, we know that last_eb will definitely
- * change, so lets traverse the tree and find the second to
- * last extent block. */
- el = &(fe->id2.i_list);
- /* go down the tree, */
- do {
- for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
- if (le32_to_cpu(el->l_recs[i].e_cpos) <
- new_i_clusters) {
- block = le64_to_cpu(el->l_recs[i].e_blkno);
- break;
- }
+ /* We may have a valid extent in index 1, check it. */
+ if (next_free == 2)
+ rec = &el->l_recs[1];
+
+ /*
+ * Fall through - no more nonempty extents, so we want
+ * to delete this leaf.
+ */
+ } else {
+ if (next_free > 1)
+ goto out;
+
+ rec = &el->l_recs[0];
+ }
+
+ if (rec) {
+ /*
+ * Check if we'll only be trimming off the end of this
+ * cluster.
+ */
+ if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
+ goto out;
+ }
+
+ ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) bh->b_data;
+ el = &eb->h_list;
+ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+ OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+ ret = -EROFS;
+ goto out;
+ }
+
+ *new_last_eb = bh;
+ get_bh(*new_last_eb);
+ mlog(0, "returning block %llu, (cpos: %u)\n",
+ (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
+ out:
+ brelse(bh);
+
+ return ret;
+ }
+
+ /*
+ * Trim some clusters off the rightmost edge of a tree. Only called
+ * during truncate.
+ *
+ * The caller needs to:
+ * - start journaling of each path component.
+ * - compute and fully set up any new last ext block
+ */
+ static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
+ handle_t *handle, struct ocfs2_truncate_context *tc,
+ u32 clusters_to_del, u64 *delete_start)
+ {
+ int ret, i, index = path->p_tree_depth;
+ u32 new_edge = 0;
+ u64 deleted_eb = 0;
+ struct buffer_head *bh;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_rec *rec;
+
+ *delete_start = 0;
+
+ while (index >= 0) {
+ bh = path->p_node[index].bh;
+ el = path->p_node[index].el;
+
+ mlog(0, "traveling tree (index = %d, block = %llu)\n",
+ index, (unsigned long long)bh->b_blocknr);
+
+ BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
+
+ if (index !=
+ (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has invalid ext. block %llu",
+ inode->i_ino,
+ (unsigned long long)bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
  }
- BUG_ON(i < 0);

- if (bh) {
- brelse(bh);
- bh = NULL;
+ find_tail_record:
+ i = le16_to_cpu(el->l_next_free_rec) - 1;
+ rec = &el->l_recs[i];
+
+ mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
+ "next = %u\n", i, le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec),
+ (unsigned long long)le64_to_cpu(rec->e_blkno),
+ le16_to_cpu(el->l_next_free_rec));
+
+ BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
+
+ if (le16_to_cpu(el->l_tree_depth) == 0) {
+ /*
+ * If the leaf block contains a single empty
+ * extent and no records, we can just remove
+ * the block.
+ */
+ if (i == 0 && ocfs2_is_empty_extent(rec)) {
+ memset(rec, 0,
+ sizeof(struct ocfs2_extent_rec));
+ el->l_next_free_rec = cpu_to_le16(0);
+
+ goto delete;
+ }
+
+ /*
+ * Remove any empty extents by shifting things
+ * left. That should make life much easier on
+ * the code below. This condition is rare
+ * enough that we shouldn't see a performance
+ * hit.
+ */
+ if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+ le16_add_cpu(&el->l_next_free_rec, -1);
+
+ for(i = 0;
+ i < le16_to_cpu(el->l_next_free_rec); i++)
+ el->l_recs[i] = el->l_recs[i + 1];
+
+ memset(&el->l_recs[i], 0,
+ sizeof(struct ocfs2_extent_rec));
+
+ /*
+ * We've modified our extent list. The
+ * simplest way to handle this change
+ * is to begin the search from the
+ * start again.
+ */
+ goto find_tail_record;
+ }
+
+ le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
+
+ /*
+ * We'll use "new_edge" on our way back up the
+ * tree to know what our rightmost cpos is.
+ */
+ new_edge = le16_to_cpu(rec->e_leaf_clusters);
+ new_edge += le32_to_cpu(rec->e_cpos);
+
+ /*
+ * The caller will use this to delete data blocks.
+ */
+ *delete_start = le64_to_cpu(rec->e_blkno)
+ + ocfs2_clusters_to_blocks(inode->i_sb,
+ le16_to_cpu(rec->e_leaf_clusters));
+
+ /*
+ * If it's now empty, remove this record.
+ */
+ if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
+ memset(rec, 0,
+ sizeof(struct ocfs2_extent_rec));
+ le16_add_cpu(&el->l_next_free_rec, -1);
+ }
+ } else {
+ if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
+ memset(rec, 0,
+ sizeof(struct ocfs2_extent_rec));
+ le16_add_cpu(&el->l_next_free_rec, -1);
+
+ goto delete;
+ }
+
+ /* Can this actually happen? */
+ if (le16_to_cpu(el->l_next_free_rec) == 0)
+ goto delete;
+
+ /*
+ * We never actually deleted any clusters
+ * because our leaf was empty. There's no
+ * reason to adjust the rightmost edge then.
+ */
+ if (new_edge == 0)
+ goto delete;
+
+ rec->e_int_clusters = cpu_to_le32(new_edge);
+ le32_add_cpu(&rec->e_int_clusters,
+ -le32_to_cpu(rec->e_cpos));
+
+ /*
+ * A deleted child record should have been
+ * caught above.
+ */
+ BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
  }

- status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
- inode);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ delete:
+ ret = ocfs2_journal_dirty(handle, bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
  }
- eb = (struct ocfs2_extent_block *) bh->b_data;
- el = &eb->h_list;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- status = -EIO;
- goto bail;
+
+ mlog(0, "extent list container %llu, after: record %d: "
+ "(%u, %u, %llu), next = %u.\n",
+ (unsigned long long)bh->b_blocknr, i,
+ le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
+ (unsigned long long)le64_to_cpu(rec->e_blkno),
+ le16_to_cpu(el->l_next_free_rec));
+
+ /*
+ * We must be careful to only attempt delete of an
+ * extent block (and not the root inode block).
+ */
+ if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
+ struct ocfs2_extent_block *eb =
+ (struct ocfs2_extent_block *)bh->b_data;
+
+ /*
+ * Save this for use when processing the
+ * parent block.
+ */
+ deleted_eb = le64_to_cpu(eb->h_blkno);
+
+ mlog(0, "deleting this extent block.\n");
+
+ ocfs2_remove_from_cache(inode, bh);
+
+ BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
+ BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
+ BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
+
+ if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
+ /*
+ * This code only understands how to
+ * lock the suballocator in slot 0,
+ * which is fine because allocation is
+ * only ever done out of that
+ * suballocator too. A future version
+ * might change that however, so avoid
+ * a free if we don't know how to
+ * handle it. This way an fs incompat
+ * bit will not be necessary.
+ */
+ ret = ocfs2_free_extent_block(handle,
+ tc->tc_ext_alloc_inode,
+ tc->tc_ext_alloc_bh,
+ eb);
+
+ /* An error here is not fatal. */
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+ } else {
+ deleted_eb = 0;
  }
- } while (el->l_tree_depth);

- *new_last_eb = bh;
- get_bh(*new_last_eb);
- mlog(0, "returning block %llu\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno));
- bail:
- if (bh)
- brelse(bh);
+ index--;
+ }

- return status;
+ ret = 0;
+ out:
+ return ret;
  }
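
The leaf-level bookkeeping in ocfs2_trim_tree() boils down to two results: the new rightmost cpos ("new_edge") and the first disk block to hand to the truncate log. A standalone sketch of that arithmetic, with made-up numbers and a simplified record type:

#include <stdint.h>
#include <stdio.h>

struct toy_rec { uint32_t cpos; uint64_t blkno; uint16_t clusters; };

/*
 * Trimming del clusters shrinks the tail record; the data to free
 * starts right after the clusters we keep (bpc = blocks per cluster).
 */
static uint64_t trim_tail(struct toy_rec *rec, uint16_t del,
			  unsigned int bpc, uint32_t *new_edge)
{
	rec->clusters -= del;
	*new_edge = rec->cpos + rec->clusters;
	return rec->blkno + (uint64_t)rec->clusters * bpc;
}

int main(void)
{
	struct toy_rec rec = { 100, 5000, 16 };
	uint32_t edge;
	uint64_t start = trim_tail(&rec, 6, 8, &edge);

	/* keeps 10 clusters: edge = 110, delete_start = 5000 + 80 */
	printf("new edge %u, delete from block %llu\n",
	       edge, (unsigned long long)start);
	return 0;
}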

  static int ocfs2_do_truncate(struct ocfs2_super *osb,
  unsigned int clusters_to_del,
  struct inode *inode,
  struct buffer_head *fe_bh,
- struct buffer_head *old_last_eb_bh,
  handle_t *handle,
- struct ocfs2_truncate_context *tc)
+ struct ocfs2_truncate_context *tc,
+ struct ocfs2_path *path)
  {
- int status, i, depth;
+ int status;
  struct ocfs2_dinode *fe;
- struct ocfs2_extent_block *eb;
  struct ocfs2_extent_block *last_eb = NULL;
  struct ocfs2_extent_list *el;
- struct buffer_head *eb_bh = NULL;
  struct buffer_head *last_eb_bh = NULL;
- u64 next_eb = 0;
  u64 delete_blk = 0;

  fe = (struct ocfs2_dinode *) fe_bh->b_data;

- status = ocfs2_find_new_last_ext_blk(osb,
- inode,
- fe,
- le32_to_cpu(fe->i_clusters) -
- clusters_to_del,
- old_last_eb_bh,
- &last_eb_bh);
+ status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
+ path, &last_eb_bh);
  if (status < 0) {
  mlog_errno(status);
  goto bail;
  }
- if (last_eb_bh)
- last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;

- status = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ /*
+ * Each component will be touched, so we might as well journal
+ * here to avoid having to handle errors later.
+ */
+ status = ocfs2_journal_access_path(inode, handle, path);
  if (status < 0) {
  mlog_errno(status);
  goto bail;
  }
+
+ if (last_eb_bh) {
+ status = ocfs2_journal_access(handle, inode, last_eb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+ }
+
  el = &(fe->id2.i_list);

+ /*
+ * Lower levels depend on this never happening, but it's best
+ * to check it up here before changing the tree.
+ */
+ if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has an empty extent record, depth %u\n",
+ inode->i_ino, le16_to_cpu(el->l_tree_depth));
+ status = -EROFS;
+ goto bail;
+ }
+
  spin_lock(&OCFS2_I(inode)->ip_lock);
  OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
  clusters_to_del;
  spin_unlock(&OCFS2_I(inode)->ip_lock);
  le32_add_cpu(&fe->i_clusters, -clusters_to_del);
- fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
- fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
-
- i = le16_to_cpu(el->l_next_free_rec) - 1;
-
- BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
- le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
- /* tree depth zero, we can just delete the clusters, otherwise
- * we need to record the offset of the next level extent block
- * as we may overwrite it. */
- if (!el->l_tree_depth)
- delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
- + ocfs2_clusters_to_blocks(osb->sb,
- le32_to_cpu(el->l_recs[i].e_clusters));
- else
- next_eb = le64_to_cpu(el->l_recs[i].e_blkno);

- if (!el->l_recs[i].e_clusters) {
- /* if we deleted the whole extent record, then clear
- * out the other fields and update the extent
- * list. For depth > 0 trees, we've already recorded
- * the extent block in 'next_eb' */
- el->l_recs[i].e_cpos = 0;
- el->l_recs[i].e_blkno = 0;
- BUG_ON(!el->l_next_free_rec);
- le16_add_cpu(&el->l_next_free_rec, -1);
+ status = ocfs2_trim_tree(inode, path, handle, tc,
+ clusters_to_del, &delete_blk);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
  }

- depth = le16_to_cpu(el->l_tree_depth);
- if (!fe->i_clusters) {
+ if (le32_to_cpu(fe->i_clusters) == 0) {
  /* trunc to zero is a special case. */
  el->l_tree_depth = 0;
  fe->i_last_eb_blk = 0;
@@ -1625,12 +3360,6 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
  /* If there will be a new last extent block, then by
  * definition, there cannot be any leaves to the right of
  * him. */
- status = ocfs2_journal_access(handle, inode, last_eb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
  last_eb->h_next_leaf_blk = 0;
  status = ocfs2_journal_dirty(handle, last_eb_bh);
  if (status < 0) {
@@ -1639,123 +3368,247 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
  }
  }

- /* if our tree depth > 0, update all the tree blocks below us. */
- while (depth) {
- mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n",
- depth, (unsigned long long)next_eb);
- status = ocfs2_read_block(osb, next_eb, &eb_bh,
- OCFS2_BH_CACHED, inode);
+ if (delete_blk) {
+ status = ocfs2_truncate_log_append(osb, handle, delete_blk,
+ clusters_to_del);
  if (status < 0) {
  mlog_errno(status);
  goto bail;
  }
- eb = (struct ocfs2_extent_block *)eb_bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- status = -EIO;
- goto bail;
+ }
+ status = 0;
+ bail:
+
+ mlog_exit(status);
+ return status;
+ }
+
+ static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
+ {
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ return 0;
+ }
+
+ static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
+ {
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ return ocfs2_journal_dirty_data(handle, bh);
+ }
+
+ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
+ struct page **pages, int numpages,
+ u64 phys, handle_t *handle)
+ {
+ int i, ret, partial = 0;
+ void *kaddr;
+ struct page *page;
+ unsigned int from, to = PAGE_CACHE_SIZE;
+ struct super_block *sb = inode->i_sb;
+
+ BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+ if (numpages == 0)
+ goto out;
+
+ from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
+ if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
+ /*
+ * Since 'from' has been capped to a value below page
+ * size, this calculation won't be able to overflow
+ * 'to'
+ */
+ to = ocfs2_align_bytes_to_clusters(sb, from);
+
+ /*
+ * The truncate tail in this case should never contain
+ * more than one page at maximum. The loop below also
+ * assumes this.
+ */
+ BUG_ON(numpages != 1);
+ }
+
+ for(i = 0; i < numpages; i++) {
+ page = pages[i];
+
+ BUG_ON(from > PAGE_CACHE_SIZE);
+ BUG_ON(to > PAGE_CACHE_SIZE);
+
+ ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
+ if (ret)
+ mlog_errno(ret);
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + from, 0, to - from);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ /*
+ * Need to set the buffers we zero'd into uptodate
+ * here if they aren't - ocfs2_map_page_blocks()
+ * might've skipped some
+ */
+ if (ocfs2_should_order_data(inode)) {
+ ret = walk_page_buffers(handle,
+ page_buffers(page),
+ from, to, &partial,
+ ocfs2_ordered_zero_func);
+ if (ret < 0)
+ mlog_errno(ret);
+ } else {
+ ret = walk_page_buffers(handle, page_buffers(page),
+ from, to, &partial,
+ ocfs2_writeback_zero_func);
+ if (ret < 0)
+ mlog_errno(ret);
  }
- el = &(eb->h_list);

- status = ocfs2_journal_access(handle, inode, eb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ if (!partial)
+ SetPageUptodate(page);
+
+ flush_dcache_page(page);
+
+ /*
+ * Every page after the 1st one should be completely zero'd.
+ */
+ from = 0;
+ }
+ out:
+ if (pages) {
+ for (i = 0; i < numpages; i++) {
+ page = pages[i];
+ unlock_page(page);
+ mark_page_accessed(page);
+ page_cache_release(page);
  }
+ }
+ }
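
The from/to bounds used by the zeroing loop are easy to check in userspace. The helper below is an assumed analogue of the kernel's cluster-alignment macro, not the real ocfs2_align_bytes_to_clusters(); page and cluster sizes are hypothetical powers of two:

#include <stdint.h>
#include <stdio.h>

/*
 * 'from' is i_size's offset within its page; 'to' is the end of the
 * page, or - when clusters are smaller than pages - the end of the
 * cluster, rounded up from 'from'.
 */
static void zero_bounds(uint64_t isize, uint32_t page_size,
			uint32_t cluster_size, uint32_t *from, uint32_t *to)
{
	*from = isize & (page_size - 1);
	if (page_size > cluster_size)
		*to = (*from + cluster_size - 1) & ~(cluster_size - 1);
	else
		*to = page_size;
}

int main(void)
{
	uint32_t from, to;

	/* 4K pages, 4K clusters: zero from offset 1000 to the page end */
	zero_bounds(1000, 4096, 4096, &from, &to);
	printf("from=%u to=%u\n", from, to);

	/* 4K pages, 1K clusters: zero only to the end of the cluster */
	zero_bounds(5000, 4096, 1024, &from, &to);
	printf("from=%u to=%u\n", from, to);
	return 0;
}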

- BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
- BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
+ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
+ int *num, u64 *phys)
+ {
+ int i, numpages = 0, ret = 0;
+ unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
+ unsigned int ext_flags;
+ struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned long index;
+ u64 next_cluster_bytes;
+
+ BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+ /* Cluster boundary, so we don't need to grab any pages. */
+ if ((isize & (csize - 1)) == 0)
+ goto out;

- i = le16_to_cpu(el->l_next_free_rec) - 1;
+ ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
+ phys, NULL, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }

- mlog(0, "extent block %llu, before: record %d: "
- "(%u, %u, %llu), next = %u\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno), i,
- le32_to_cpu(el->l_recs[i].e_cpos),
- le32_to_cpu(el->l_recs[i].e_clusters),
- (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
- le16_to_cpu(el->l_next_free_rec));
+ /* Tail is a hole. */
+ if (*phys == 0)
+ goto out;

- BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
- le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
-
- next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
- /* bottom-most block requires us to delete data.*/
- if (!el->l_tree_depth)
- delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
- + ocfs2_clusters_to_blocks(osb->sb,
- le32_to_cpu(el->l_recs[i].e_clusters));
- if (!el->l_recs[i].e_clusters) {
- el->l_recs[i].e_cpos = 0;
- el->l_recs[i].e_blkno = 0;
- BUG_ON(!el->l_next_free_rec);
- le16_add_cpu(&el->l_next_free_rec, -1);
- }
- mlog(0, "extent block %llu, after: record %d: "
- "(%u, %u, %llu), next = %u\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno), i,
- le32_to_cpu(el->l_recs[i].e_cpos),
- le32_to_cpu(el->l_recs[i].e_clusters),
- (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
- le16_to_cpu(el->l_next_free_rec));
+ /* Tail is marked as unwritten, we can count on write to zero
+ * in that case. */
+ if (ext_flags & OCFS2_EXT_UNWRITTEN)
+ goto out;

- status = ocfs2_journal_dirty(handle, eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
+ index = isize >> PAGE_CACHE_SHIFT;
+ do {
+ pages[numpages] = grab_cache_page(mapping, index);
+ if (!pages[numpages]) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
  }

- if (!el->l_next_free_rec) {
- mlog(0, "deleting this extent block.\n");
-
+ numpages++;
+ index++;
+ } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT));
- ocfs2_remove_from_cache(inode, eb_bh);

- BUG_ON(el->l_recs[0].e_clusters);
- BUG_ON(el->l_recs[0].e_cpos);
- BUG_ON(el->l_recs[0].e_blkno);
- if (eb->h_suballoc_slot == 0) {
- /*
- * This code only understands how to
- * lock the suballocator in slot 0,
- * which is fine because allocation is
- * only ever done out of that
- * suballocator too. A future version
- * might change that however, so avoid
- * a free if we don't know how to
- * handle it. This way an fs incompat
- * bit will not be necessary.
- */
- status = ocfs2_free_extent_block(handle,
- tc->tc_ext_alloc_inode,
- tc->tc_ext_alloc_bh,
- eb);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ out:
+ if (ret != 0) {
+ if (pages) {
+ for (i = 0; i < numpages; i++) {
+ if (pages[i]) {
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
  }
  }
  }
- brelse(eb_bh);
- eb_bh = NULL;
- depth--;
+ numpages = 0;
  }

- BUG_ON(!delete_blk);
- status = ocfs2_truncate_log_append(osb, handle, delete_blk,
- clusters_to_del);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ *num = numpages;
+
+ return ret;
+ }
+
+ /*
+ * Zero the area past i_size but still within an allocated
+ * cluster. This avoids exposing nonzero data on subsequent file
+ * extends.
+ *
+ * We need to call this before i_size is updated on the inode because
+ * otherwise block_write_full_page() will skip writeout of pages past
+ * i_size. The new_i_size parameter is passed for this reason.
+ */
+ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
+ u64 new_i_size)
+ {
+ int ret, numpages;
+ loff_t endbyte;
+ struct page **pages = NULL;
+ u64 phys;
+
+ /*
+ * File systems which don't support sparse files zero on every
+ * extend.
+ */
+ if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ return 0;
+
+ pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
+ sizeof(struct page *), GFP_NOFS);
+ if (pages == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
  }
- status = 0;
- bail:
- if (!status)
- ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
- else
- ocfs2_extent_map_drop(inode, 0);
- mlog_exit(status);
- return status;
+
+ ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (numpages == 0)
+ goto out;
+
+ ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
+ handle);
+
+ /*
+ * Initiate writeout of the pages we zero'd here. We don't
+ * wait on them - the truncate_inode_pages() call later will
+ * do that for us.
+ */
+ endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
+ ret = do_sync_mapping_range(inode->i_mapping, new_i_size,
+ endbyte - 1, SYNC_FILE_RANGE_WRITE);
+ if (ret)
+ mlog_errno(ret);
+
+ out:
+ if (pages)
+ kfree(pages);
+
+ return ret;
  }

  /*
@@ -1770,82 +3623,90 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
1770 struct ocfs2_truncate_context *tc) 3623 struct ocfs2_truncate_context *tc)
1771{ 3624{
1772 int status, i, credits, tl_sem = 0; 3625 int status, i, credits, tl_sem = 0;
1773 u32 clusters_to_del, target_i_clusters; 3626 u32 clusters_to_del, new_highest_cpos, range;
1774 u64 last_eb = 0;
1775 struct ocfs2_dinode *fe;
1776 struct ocfs2_extent_block *eb;
1777 struct ocfs2_extent_list *el; 3627 struct ocfs2_extent_list *el;
1778 struct buffer_head *last_eb_bh;
1779 handle_t *handle = NULL; 3628 handle_t *handle = NULL;
1780 struct inode *tl_inode = osb->osb_tl_inode; 3629 struct inode *tl_inode = osb->osb_tl_inode;
3630 struct ocfs2_path *path = NULL;
1781 3631
1782 mlog_entry_void(); 3632 mlog_entry_void();
1783 3633
1784 down_write(&OCFS2_I(inode)->ip_alloc_sem); 3634 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1785 3635
1786 target_i_clusters = ocfs2_clusters_for_bytes(osb->sb, 3636 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
1787 i_size_read(inode)); 3637 i_size_read(inode));
1788 3638
1789 last_eb_bh = tc->tc_last_eb_bh; 3639 path = ocfs2_new_inode_path(fe_bh);
1790 tc->tc_last_eb_bh = NULL; 3640 if (!path) {
3641 status = -ENOMEM;
3642 mlog_errno(status);
3643 goto bail;
3644 }
1791 3645
1792 fe = (struct ocfs2_dinode *) fe_bh->b_data; 3646 ocfs2_extent_map_trunc(inode, new_highest_cpos);
1793 3647
1794 if (fe->id2.i_list.l_tree_depth) {
1795 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1796 el = &eb->h_list;
1797 } else
1798 el = &fe->id2.i_list;
1799 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1800start: 3648start:
1801 mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, " 3649 /*
1802 "last_eb = %llu, fe->i_last_eb_blk = %llu, " 3650 * Check that we still have allocation to delete.
1803 "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n", 3651 */
1804 le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb, 3652 if (OCFS2_I(inode)->ip_clusters == 0) {
1805 (unsigned long long)le64_to_cpu(fe->i_last_eb_blk), 3653 status = 0;
1806 le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh); 3654 goto bail;
1807 3655 }
1808 if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
1809 mlog(0, "last_eb changed!\n");
1810 BUG_ON(!fe->id2.i_list.l_tree_depth);
1811 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1812 /* i_last_eb_blk may have changed, read it if
1813 * necessary. We don't have to worry about the
1814 * truncate to zero case here (where there becomes no
1815 * last_eb) because we never loop back after our work
1816 * is done. */
1817 if (last_eb_bh) {
1818 brelse(last_eb_bh);
1819 last_eb_bh = NULL;
1820 }
1821 3656
1822 status = ocfs2_read_block(osb, last_eb, 3657 /*
1823 &last_eb_bh, OCFS2_BH_CACHED, 3658 * Truncate always works against the rightmost tree branch.
1824 inode); 3659 */
1825 if (status < 0) { 3660 status = ocfs2_find_path(inode, path, UINT_MAX);
1826 mlog_errno(status); 3661 if (status) {
1827 goto bail; 3662 mlog_errno(status);
1828 } 3663 goto bail;
1829 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 3664 }
1830 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 3665
1831 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 3666 mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
1832 status = -EIO; 3667 OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
1833 goto bail; 3668
1834 } 3669 /*
1835 el = &(eb->h_list); 3670 * By now, el will point to the extent list on the bottommost
3671 * portion of this tree. Only the tail record is considered in
3672 * each pass.
3673 *
3674 * We handle the following cases, in order:
3675 * - empty extent: delete the remaining branch
3676 * - remove the entire record
3677 * - remove a partial record
3678 * - no record needs to be removed (truncate has completed)
3679 */
3680 el = path_leaf_el(path);
3681 if (le16_to_cpu(el->l_next_free_rec) == 0) {
3682 ocfs2_error(inode->i_sb,
3683 "Inode %llu has empty extent block at %llu\n",
3684 (unsigned long long)OCFS2_I(inode)->ip_blkno,
3685 (unsigned long long)path_leaf_bh(path)->b_blocknr);
3686 status = -EROFS;
3687 goto bail;
1836 } 3688 }
1837 3689
1838 /* by now, el will point to the extent list on the bottom most
1839 * portion of this tree. */
1840 i = le16_to_cpu(el->l_next_free_rec) - 1; 3690 i = le16_to_cpu(el->l_next_free_rec) - 1;
1841 if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters) 3691 range = le32_to_cpu(el->l_recs[i].e_cpos) +
1842 clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters); 3692 ocfs2_rec_clusters(el, &el->l_recs[i]);
1843 else 3693 if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
1844 clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) + 3694 clusters_to_del = 0;
3695 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
3696 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
3697 } else if (range > new_highest_cpos) {
3698 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
1845 le32_to_cpu(el->l_recs[i].e_cpos)) - 3699 le32_to_cpu(el->l_recs[i].e_cpos)) -
1846 target_i_clusters; 3700 new_highest_cpos;
3701 } else {
3702 status = 0;
3703 goto bail;
3704 }
1847 3705
1848 mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del); 3706 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
3707 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
3708
3709 BUG_ON(clusters_to_del == 0);
1849 3710
1850 mutex_lock(&tl_inode->i_mutex); 3711 mutex_lock(&tl_inode->i_mutex);
1851 tl_sem = 1; 3712 tl_sem = 1;
@@ -1861,7 +3722,8 @@ start:
1861 } 3722 }
1862 3723
1863 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 3724 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
1864 fe, el); 3725 (struct ocfs2_dinode *)fe_bh->b_data,
3726 el);
1865 handle = ocfs2_start_trans(osb, credits); 3727 handle = ocfs2_start_trans(osb, credits);
1866 if (IS_ERR(handle)) { 3728 if (IS_ERR(handle)) {
1867 status = PTR_ERR(handle); 3729 status = PTR_ERR(handle);
@@ -1870,13 +3732,8 @@ start:
1870 goto bail; 3732 goto bail;
1871 } 3733 }
1872 3734
1873 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 3735 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
1874 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 3736 tc, path);
1875 if (status < 0)
1876 mlog_errno(status);
1877
1878 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
1879 last_eb_bh, handle, tc);
1880 if (status < 0) { 3737 if (status < 0) {
1881 mlog_errno(status); 3738 mlog_errno(status);
1882 goto bail; 3739 goto bail;
@@ -1888,9 +3745,14 @@ start:
1888 ocfs2_commit_trans(osb, handle); 3745 ocfs2_commit_trans(osb, handle);
1889 handle = NULL; 3746 handle = NULL;
1890 3747
1891 BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters); 3748 ocfs2_reinit_path(path, 1);
1892 if (le32_to_cpu(fe->i_clusters) > target_i_clusters) 3749
1893 goto start; 3750 /*
3751 * The check above will catch the case where we've truncated
3752 * away all allocation.
3753 */
3754 goto start;
3755
1894bail: 3756bail:
1895 up_write(&OCFS2_I(inode)->ip_alloc_sem); 3757 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1896 3758
@@ -1902,8 +3764,7 @@ bail:
1902 if (handle) 3764 if (handle)
1903 ocfs2_commit_trans(osb, handle); 3765 ocfs2_commit_trans(osb, handle);
1904 3766
1905 if (last_eb_bh) 3767 ocfs2_free_path(path);
1906 brelse(last_eb_bh);
1907 3768
1908 /* This will drop the ext_alloc cluster lock for us */ 3769 /* This will drop the ext_alloc cluster lock for us */
1909 ocfs2_free_truncate_context(tc); 3770 ocfs2_free_truncate_context(tc);
@@ -1912,7 +3773,6 @@ bail:
1912 return status; 3773 return status;
1913} 3774}
1914 3775
1915
1916/* 3776/*
1917 * Expects the inode to already be locked. This will figure out which 3777 * Expects the inode to already be locked. This will figure out which
1918 * inodes need to be locked and will put them on the returned truncate 3778 * inodes need to be locked and will put them on the returned truncate
@@ -1923,7 +3783,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1923 struct buffer_head *fe_bh, 3783 struct buffer_head *fe_bh,
1924 struct ocfs2_truncate_context **tc) 3784 struct ocfs2_truncate_context **tc)
1925{ 3785{
1926 int status, metadata_delete; 3786 int status, metadata_delete, i;
1927 unsigned int new_i_clusters; 3787 unsigned int new_i_clusters;
1928 struct ocfs2_dinode *fe; 3788 struct ocfs2_dinode *fe;
1929 struct ocfs2_extent_block *eb; 3789 struct ocfs2_extent_block *eb;
@@ -1944,21 +3804,6 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1944 "%llu\n", fe->i_clusters, new_i_clusters, 3804 "%llu\n", fe->i_clusters, new_i_clusters,
1945 (unsigned long long)fe->i_size); 3805 (unsigned long long)fe->i_size);
1946 3806
1947 if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
1948 ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
1949 "%u and size %llu whereas struct inode has "
1950 "cluster count %u and size %llu which caused an "
1951 "invalid truncate to %u clusters.",
1952 (unsigned long long)le64_to_cpu(fe->i_blkno),
1953 le32_to_cpu(fe->i_clusters),
1954 (unsigned long long)le64_to_cpu(fe->i_size),
1955 OCFS2_I(inode)->ip_clusters, i_size_read(inode),
1956 new_i_clusters);
1957 mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
1958 status = -EIO;
1959 goto bail;
1960 }
1961
1962 *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); 3807 *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
1963 if (!(*tc)) { 3808 if (!(*tc)) {
1964 status = -ENOMEM; 3809 status = -ENOMEM;
@@ -1986,7 +3831,15 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1986 goto bail; 3831 goto bail;
1987 } 3832 }
1988 el = &(eb->h_list); 3833 el = &(eb->h_list);
1989 if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters) 3834
3835 i = 0;
3836 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3837 i = 1;
3838 /*
3839 * XXX: Should we check that next_free_rec contains
3840 * the extent?
3841 */
3842 if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
1990 metadata_delete = 1; 3843 metadata_delete = 1;
1991 } 3844 }
1992 3845
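
[Editor's note: a sketch, not part of the patch. The per-pass decision in
ocfs2_commit_truncate() above is easiest to see in isolation; the helper
below restates it with the same branch structure. The helper name and the
*done out-parameter are hypothetical, while the record fields and the
ocfs2_rec_clusters()/ocfs2_is_empty_extent() accessors are the ones this
patch introduces.]

/*
 * Hypothetical helper, for illustration only: given the tail record of
 * the rightmost leaf and the new highest cluster offset, decide how
 * many clusters this truncate pass should remove.
 */
static u32 truncate_clusters_this_pass(struct ocfs2_extent_list *el, int i,
				       u32 new_highest_cpos, int *done)
{
	u32 cpos = le32_to_cpu(el->l_recs[i].e_cpos);
	u32 len = ocfs2_rec_clusters(el, &el->l_recs[i]);

	*done = 0;
	if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i]))
		return 0;	/* empty extent: strip the remaining branch */
	if (cpos >= new_highest_cpos)
		return len;	/* record lies entirely past the new EOF */
	if (cpos + len > new_highest_cpos)
		return cpos + len - new_highest_cpos;	/* partial record */
	*done = 1;		/* truncate has completed */
	return 0;
}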
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 0b82e8044325..fbcb5934a081 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -31,7 +31,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
31 handle_t *handle, 31 handle_t *handle,
32 struct inode *inode, 32 struct inode *inode,
33 struct buffer_head *fe_bh, 33 struct buffer_head *fe_bh,
34 u64 blkno, 34 u32 cpos,
35 u64 start_blk,
35 u32 new_clusters, 36 u32 new_clusters,
36 struct ocfs2_alloc_context *meta_ac); 37 struct ocfs2_alloc_context *meta_ac);
37int ocfs2_num_free_extents(struct ocfs2_super *osb, 38int ocfs2_num_free_extents(struct ocfs2_super *osb,
@@ -70,6 +71,8 @@ struct ocfs2_truncate_context {
70 struct buffer_head *tc_last_eb_bh; 71 struct buffer_head *tc_last_eb_bh;
71}; 72};
72 73
74int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
75 u64 new_i_size);
73int ocfs2_prepare_truncate(struct ocfs2_super *osb, 76int ocfs2_prepare_truncate(struct ocfs2_super *osb,
74 struct inode *inode, 77 struct inode *inode,
75 struct buffer_head *fe_bh, 78 struct buffer_head *fe_bh,
@@ -79,4 +82,26 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
79 struct buffer_head *fe_bh, 82 struct buffer_head *fe_bh,
80 struct ocfs2_truncate_context *tc); 83 struct ocfs2_truncate_context *tc);
81 84
85int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
86 u32 cpos, struct buffer_head **leaf_bh);
87
88/*
89 * Helper function to look at the # of clusters in an extent record.
90 */
91static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
92 struct ocfs2_extent_rec *rec)
93{
94 /*
95 * Cluster count in extent records is slightly different
96 * between interior nodes and leaf nodes. This is to support
97 * unwritten extents which need a flags field in leaf node
98 * records, thus shrinking the available space for a clusters
99 * field.
100 */
101 if (el->l_tree_depth)
102 return le32_to_cpu(rec->e_int_clusters);
103 else
104 return le16_to_cpu(rec->e_leaf_clusters);
105}
106
82#endif /* OCFS2_ALLOC_H */ 107#endif /* OCFS2_ALLOC_H */
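
[Editor's note: a quick worked use of the accessor above, illustrative
only. Because leaf and interior records keep their cluster counts in
different fields, range computations should go through
ocfs2_rec_clusters(); this is exactly the 'range' value computed by the
truncate loop in alloc.c.]

	/* The half-open cluster range [cpos, range) covered by record i. */
	u32 cpos  = le32_to_cpu(el->l_recs[i].e_cpos);
	u32 len   = ocfs2_rec_clusters(el, &el->l_recs[i]);
	u32 range = cpos + len;		/* first cluster past the record */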
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 875c11443817..56963e6c46c0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -24,6 +24,8 @@
24#include <linux/highmem.h> 24#include <linux/highmem.h>
25#include <linux/pagemap.h> 25#include <linux/pagemap.h>
26#include <asm/byteorder.h> 26#include <asm/byteorder.h>
27#include <linux/swap.h>
28#include <linux/pipe_fs_i.h>
27 29
28#define MLOG_MASK_PREFIX ML_FILE_IO 30#define MLOG_MASK_PREFIX ML_FILE_IO
29#include <cluster/masklog.h> 31#include <cluster/masklog.h>
@@ -37,6 +39,7 @@
37#include "file.h" 39#include "file.h"
38#include "inode.h" 40#include "inode.h"
39#include "journal.h" 41#include "journal.h"
42#include "suballoc.h"
40#include "super.h" 43#include "super.h"
41#include "symlink.h" 44#include "symlink.h"
42 45
@@ -134,7 +137,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
134 struct buffer_head *bh_result, int create) 137 struct buffer_head *bh_result, int create)
135{ 138{
136 int err = 0; 139 int err = 0;
140 unsigned int ext_flags;
137 u64 p_blkno, past_eof; 141 u64 p_blkno, past_eof;
142 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
138 143
139 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 144 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
140 (unsigned long long)iblock, bh_result, create); 145 (unsigned long long)iblock, bh_result, create);
@@ -149,17 +154,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
149 goto bail; 154 goto bail;
150 } 155 }
151 156
152 /* this can happen if another node truncs after our extend! */ 157 err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
153 spin_lock(&OCFS2_I(inode)->ip_lock); 158 &ext_flags);
154 if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
155 OCFS2_I(inode)->ip_clusters))
156 err = -EIO;
157 spin_unlock(&OCFS2_I(inode)->ip_lock);
158 if (err)
159 goto bail;
160
161 err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
162 NULL);
163 if (err) { 159 if (err) {
164 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " 160 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
165 "%llu, NULL)\n", err, inode, (unsigned long long)iblock, 161 "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
@@ -167,22 +163,39 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
167 goto bail; 163 goto bail;
168 } 164 }
169 165
170 map_bh(bh_result, inode->i_sb, p_blkno); 166 /*
171 167 * ocfs2 never allocates in this function - the only time we
172 if (bh_result->b_blocknr == 0) { 168 * need to use BH_New is when we're extending i_size on a file
173 err = -EIO; 169 * system which doesn't support holes, in which case BH_New
174 mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n", 170 * allows block_prepare_write() to zero.
175 (unsigned long long)iblock, 171 */
176 (unsigned long long)p_blkno, 172 mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
177 (unsigned long long)OCFS2_I(inode)->ip_blkno); 173 "ino %lu, iblock %llu\n", inode->i_ino,
178 } 174 (unsigned long long)iblock);
175
176 /* Treat the unwritten extent as a hole for zeroing purposes. */
177 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
178 map_bh(bh_result, inode->i_sb, p_blkno);
179
180 if (!ocfs2_sparse_alloc(osb)) {
181 if (p_blkno == 0) {
182 err = -EIO;
183 mlog(ML_ERROR,
184 "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
185 (unsigned long long)iblock,
186 (unsigned long long)p_blkno,
187 (unsigned long long)OCFS2_I(inode)->ip_blkno);
188 mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
189 dump_stack();
190 }
179 191
180 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 192 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
181 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, 193 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
182 (unsigned long long)past_eof); 194 (unsigned long long)past_eof);
183 195
184 if (create && (iblock >= past_eof)) 196 if (create && (iblock >= past_eof))
185 set_buffer_new(bh_result); 197 set_buffer_new(bh_result);
198 }
186 199
187bail: 200bail:
188 if (err < 0) 201 if (err < 0)
@@ -276,8 +289,11 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
276 return ret; 289 return ret;
277} 290}
278 291
279/* This can also be called from ocfs2_write_zero_page() which has done 292/*
280 * it's own cluster locking. */ 293 * This is called from ocfs2_write_zero_page() which has handled its
294 * own cluster locking and has ensured allocation exists for those
295 * blocks to be written.
296 */
281int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, 297int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
282 unsigned from, unsigned to) 298 unsigned from, unsigned to)
283{ 299{
@@ -292,44 +308,17 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
292 return ret; 308 return ret;
293} 309}
294 310
295/*
296 * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
297 * from loopback. It must be able to perform its own locking around
298 * ocfs2_get_block().
299 */
300static int ocfs2_prepare_write(struct file *file, struct page *page,
301 unsigned from, unsigned to)
302{
303 struct inode *inode = page->mapping->host;
304 int ret;
305
306 mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
307
308 ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
309 if (ret != 0) {
310 mlog_errno(ret);
311 goto out;
312 }
313
314 ret = ocfs2_prepare_write_nolock(inode, page, from, to);
315
316 ocfs2_meta_unlock(inode, 0);
317out:
318 mlog_exit(ret);
319 return ret;
320}
321
322/* Taken from ext3. We don't necessarily need the full blown 311/* Taken from ext3. We don't necessarily need the full blown
323 * functionality yet, but IMHO it's better to cut and paste the whole 312 * functionality yet, but IMHO it's better to cut and paste the whole
324 * thing so we can avoid introducing our own bugs (and easily pick up 313 * thing so we can avoid introducing our own bugs (and easily pick up
325 * their fixes when they happen) --Mark */ 314 * their fixes when they happen) --Mark */
326static int walk_page_buffers( handle_t *handle, 315int walk_page_buffers( handle_t *handle,
327 struct buffer_head *head, 316 struct buffer_head *head,
328 unsigned from, 317 unsigned from,
329 unsigned to, 318 unsigned to,
330 int *partial, 319 int *partial,
331 int (*fn)( handle_t *handle, 320 int (*fn)( handle_t *handle,
332 struct buffer_head *bh)) 321 struct buffer_head *bh))
333{ 322{
334 struct buffer_head *bh; 323 struct buffer_head *bh;
335 unsigned block_start, block_end; 324 unsigned block_start, block_end;
@@ -388,95 +377,6 @@ out:
388 return handle; 377 return handle;
389} 378}
390 379
391static int ocfs2_commit_write(struct file *file, struct page *page,
392 unsigned from, unsigned to)
393{
394 int ret;
395 struct buffer_head *di_bh = NULL;
396 struct inode *inode = page->mapping->host;
397 handle_t *handle = NULL;
398 struct ocfs2_dinode *di;
399
400 mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
401
402 /* NOTE: ocfs2_file_aio_write has ensured that it's safe for
403 * us to continue here without rechecking the I/O against
404 * changed inode values.
405 *
406 * 1) We're currently holding the inode alloc lock, so no
407 * nodes can change it underneath us.
408 *
409 * 2) We've had to take the metadata lock at least once
410 * already to check for extending writes, suid removal, etc.
411 * The meta data update code then ensures that we don't get a
412 * stale inode allocation image (i_size, i_clusters, etc).
413 */
414
415 ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page);
416 if (ret != 0) {
417 mlog_errno(ret);
418 goto out;
419 }
420
421 ret = ocfs2_data_lock_with_page(inode, 1, page);
422 if (ret != 0) {
423 mlog_errno(ret);
424 goto out_unlock_meta;
425 }
426
427 handle = ocfs2_start_walk_page_trans(inode, page, from, to);
428 if (IS_ERR(handle)) {
429 ret = PTR_ERR(handle);
430 goto out_unlock_data;
431 }
432
433 /* Mark our buffer early. We'd rather catch this error up here
434 * as opposed to after a successful commit_write which would
435 * require us to set back inode->i_size. */
436 ret = ocfs2_journal_access(handle, inode, di_bh,
437 OCFS2_JOURNAL_ACCESS_WRITE);
438 if (ret < 0) {
439 mlog_errno(ret);
440 goto out_commit;
441 }
442
443 /* might update i_size */
444 ret = generic_commit_write(file, page, from, to);
445 if (ret < 0) {
446 mlog_errno(ret);
447 goto out_commit;
448 }
449
450 di = (struct ocfs2_dinode *)di_bh->b_data;
451
452 /* ocfs2_mark_inode_dirty() is too heavy to use here. */
453 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
454 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
455 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
456
457 inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
458 di->i_size = cpu_to_le64((u64)i_size_read(inode));
459
460 ret = ocfs2_journal_dirty(handle, di_bh);
461 if (ret < 0) {
462 mlog_errno(ret);
463 goto out_commit;
464 }
465
466out_commit:
467 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
468out_unlock_data:
469 ocfs2_data_unlock(inode, 1);
470out_unlock_meta:
471 ocfs2_meta_unlock(inode, 1);
472out:
473 if (di_bh)
474 brelse(di_bh);
475
476 mlog_exit(ret);
477 return ret;
478}
479
480static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) 380static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
481{ 381{
482 sector_t status; 382 sector_t status;
@@ -499,8 +399,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
499 down_read(&OCFS2_I(inode)->ip_alloc_sem); 399 down_read(&OCFS2_I(inode)->ip_alloc_sem);
500 } 400 }
501 401
502 err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, 402 err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL);
503 NULL);
504 403
505 if (!INODE_JOURNAL(inode)) { 404 if (!INODE_JOURNAL(inode)) {
506 up_read(&OCFS2_I(inode)->ip_alloc_sem); 405 up_read(&OCFS2_I(inode)->ip_alloc_sem);
@@ -540,8 +439,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
540 struct buffer_head *bh_result, int create) 439 struct buffer_head *bh_result, int create)
541{ 440{
542 int ret; 441 int ret;
543 u64 p_blkno, inode_blocks; 442 u64 p_blkno, inode_blocks, contig_blocks;
544 int contig_blocks; 443 unsigned int ext_flags;
545 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 444 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
546 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; 445 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
547 446
@@ -549,33 +448,20 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
549 * nicely aligned and of the right size, so there's no need 448 * nicely aligned and of the right size, so there's no need
550 * for us to check any of that. */ 449 * for us to check any of that. */
551 450
552 spin_lock(&OCFS2_I(inode)->ip_lock); 451 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
553 inode_blocks = ocfs2_clusters_to_blocks(inode->i_sb,
554 OCFS2_I(inode)->ip_clusters);
555
556 /*
557 * For a read which begins past the end of file, we return a hole.
558 */
559 if (!create && (iblock >= inode_blocks)) {
560 spin_unlock(&OCFS2_I(inode)->ip_lock);
561 ret = 0;
562 goto bail;
563 }
564 452
565 /* 453 /*
566 * Any write past EOF is not allowed because we'd be extending. 454 * Any write past EOF is not allowed because we'd be extending.
567 */ 455 */
568 if (create && (iblock + max_blocks) > inode_blocks) { 456 if (create && (iblock + max_blocks) > inode_blocks) {
569 spin_unlock(&OCFS2_I(inode)->ip_lock);
570 ret = -EIO; 457 ret = -EIO;
571 goto bail; 458 goto bail;
572 } 459 }
573 spin_unlock(&OCFS2_I(inode)->ip_lock);
574 460
575 /* This figures out the size of the next contiguous block, and 461 /* This figures out the size of the next contiguous block, and
576 * our logical offset */ 462 * our logical offset */
577 ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, 463 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
578 &contig_blocks); 464 &contig_blocks, &ext_flags);
579 if (ret) { 465 if (ret) {
580 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", 466 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
581 (unsigned long long)iblock); 467 (unsigned long long)iblock);
@@ -583,7 +469,37 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
583 goto bail; 469 goto bail;
584 } 470 }
585 471
586 map_bh(bh_result, inode->i_sb, p_blkno); 472 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
473 ocfs2_error(inode->i_sb,
474 "Inode %llu has a hole at block %llu\n",
475 (unsigned long long)OCFS2_I(inode)->ip_blkno,
476 (unsigned long long)iblock);
477 ret = -EROFS;
478 goto bail;
479 }
480
481 /*
482 * get_more_blocks() expects us to describe a hole by clearing
483 * the mapped bit on bh_result().
484 *
485 * Consider an unwritten extent as a hole.
486 */
487 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
488 map_bh(bh_result, inode->i_sb, p_blkno);
489 else {
490 /*
491 * ocfs2_prepare_inode_for_write() should have caught
492 * the case where we'd be filling a hole and triggered
493 * a buffered write instead.
494 */
495 if (create) {
496 ret = -EIO;
497 mlog_errno(ret);
498 goto bail;
499 }
500
501 clear_buffer_mapped(bh_result);
502 }
587 503
588 /* make sure we don't map more than max_blocks blocks here as 504 /* make sure we don't map more than max_blocks blocks here as
589 that's all the kernel will handle at this point. */ 505 that's all the kernel will handle at this point. */
@@ -606,12 +522,17 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
606 void *private) 522 void *private)
607{ 523{
608 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 524 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
525 int level;
609 526
610 /* this io's submitter should not have unlocked this before we could */ 527 /* this io's submitter should not have unlocked this before we could */
611 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 528 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
529
612 ocfs2_iocb_clear_rw_locked(iocb); 530 ocfs2_iocb_clear_rw_locked(iocb);
613 up_read(&inode->i_alloc_sem); 531
614 ocfs2_rw_unlock(inode, 0); 532 level = ocfs2_iocb_rw_locked_level(iocb);
533 if (!level)
534 up_read(&inode->i_alloc_sem);
535 ocfs2_rw_unlock(inode, level);
615} 536}
616 537
617/* 538/*
@@ -647,23 +568,27 @@ static ssize_t ocfs2_direct_IO(int rw,
647 568
648 mlog_entry_void(); 569 mlog_entry_void();
649 570
650 /* 571 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
651 * We get PR data locks even for O_DIRECT. This allows 572 /*
652 * concurrent O_DIRECT I/O but doesn't let O_DIRECT with 573 * We get PR data locks even for O_DIRECT. This
653 * extending and buffered zeroing writes race. If they did 574 * allows concurrent O_DIRECT I/O but doesn't let
654 * race then the buffered zeroing could be written back after 575 * O_DIRECT with extending and buffered zeroing writes
655 * the O_DIRECT I/O. It's one thing to tell people not to mix 576 * race. If they did race then the buffered zeroing
656 * buffered and O_DIRECT writes, but expecting them to 577 * could be written back after the O_DIRECT I/O. It's
657 * understand that file extension is also an implicit buffered 578 * one thing to tell people not to mix buffered and
658 * write is too much. By getting the PR we force writeback of 579 * O_DIRECT writes, but expecting them to understand
659 * the buffered zeroing before proceeding. 580 * that file extension is also an implicit buffered
660 */ 581 * write is too much. By getting the PR we force
661 ret = ocfs2_data_lock(inode, 0); 582 * writeback of the buffered zeroing before
662 if (ret < 0) { 583 * proceeding.
663 mlog_errno(ret); 584 */
664 goto out; 585 ret = ocfs2_data_lock(inode, 0);
586 if (ret < 0) {
587 mlog_errno(ret);
588 goto out;
589 }
590 ocfs2_data_unlock(inode, 0);
665 } 591 }
666 ocfs2_data_unlock(inode, 0);
667 592
668 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 593 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
669 inode->i_sb->s_bdev, iov, offset, 594 inode->i_sb->s_bdev, iov, offset,
@@ -675,11 +600,715 @@ out:
675 return ret; 600 return ret;
676} 601}
677 602
603static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
604 u32 cpos,
605 unsigned int *start,
606 unsigned int *end)
607{
608 unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
609
610 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
611 unsigned int cpp;
612
613 cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
614
615 cluster_start = cpos % cpp;
616 cluster_start = cluster_start << osb->s_clustersize_bits;
617
618 cluster_end = cluster_start + osb->s_clustersize;
619 }
620
621 BUG_ON(cluster_start > PAGE_SIZE);
622 BUG_ON(cluster_end > PAGE_SIZE);
623
624 if (start)
625 *start = cluster_start;
626 if (end)
627 *end = cluster_end;
628}
629
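/*
 * [Editor's worked example, not part of the patch.] The boundary math
 * above only matters when a page holds several clusters. Assuming
 * PAGE_CACHE_SHIFT = 12 (4K pages) and s_clustersize_bits = 10 (1K
 * clusters): cpp = 1 << (12 - 10) = 4 clusters per page. For cpos = 6,
 * 6 % 4 = 2, so cluster_start = 2 << 10 = 2048 and cluster_end =
 * 2048 + 1024 = 3072, i.e. cluster 6 occupies bytes [2048, 3072) of
 * its page. When clusters are at least page sized, the defaults of 0
 * and PAGE_CACHE_SIZE are kept and the whole page belongs to one
 * cluster.
 */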
630/*
631 * 'from' and 'to' are the region in the page to avoid zeroing.
632 *
633 * If pagesize > clustersize, this function will avoid zeroing outside
634 * of the cluster boundary.
635 *
636 * from == to == 0 is code for "zero the entire cluster region"
637 */
638static void ocfs2_clear_page_regions(struct page *page,
639 struct ocfs2_super *osb, u32 cpos,
640 unsigned from, unsigned to)
641{
642 void *kaddr;
643 unsigned int cluster_start, cluster_end;
644
645 ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
646
647 kaddr = kmap_atomic(page, KM_USER0);
648
649 if (from || to) {
650 if (from > cluster_start)
651 memset(kaddr + cluster_start, 0, from - cluster_start);
652 if (to < cluster_end)
653 memset(kaddr + to, 0, cluster_end - to);
654 } else {
655 memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
656 }
657
658 kunmap_atomic(kaddr, KM_USER0);
659}
660
661/*
662 * Some of this is taken from block_prepare_write(). We already have our
663 * mapping by now though, and the entire write will be allocating or
664 * it won't, so not much need to use BH_New.
665 *
666 * This will also skip zeroing, which is handled externally.
667 */
668int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
669 struct inode *inode, unsigned int from,
670 unsigned int to, int new)
671{
672 int ret = 0;
673 struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
674 unsigned int block_end, block_start;
675 unsigned int bsize = 1 << inode->i_blkbits;
676
677 if (!page_has_buffers(page))
678 create_empty_buffers(page, bsize, 0);
679
680 head = page_buffers(page);
681 for (bh = head, block_start = 0; bh != head || !block_start;
682 bh = bh->b_this_page, block_start += bsize) {
683 block_end = block_start + bsize;
684
685 /*
686 * Ignore blocks outside of our i/o range -
687 * they may belong to unallocated clusters.
688 */
689 if (block_start >= to || block_end <= from) {
690 if (PageUptodate(page))
691 set_buffer_uptodate(bh);
692 continue;
693 }
694
695 /*
696 * For an allocating write with cluster size >= page
697 * size, we always write the entire page.
698 */
699
700 if (buffer_new(bh))
701 clear_buffer_new(bh);
702
703 if (!buffer_mapped(bh)) {
704 map_bh(bh, inode->i_sb, *p_blkno);
705 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
706 }
707
708 if (PageUptodate(page)) {
709 if (!buffer_uptodate(bh))
710 set_buffer_uptodate(bh);
711 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
712 (block_start < from || block_end > to)) {
713 ll_rw_block(READ, 1, &bh);
714 *wait_bh++=bh;
715 }
716
717 *p_blkno = *p_blkno + 1;
718 }
719
720 /*
721 * If we issued read requests - let them complete.
722 */
723 while(wait_bh > wait) {
724 wait_on_buffer(*--wait_bh);
725 if (!buffer_uptodate(*wait_bh))
726 ret = -EIO;
727 }
728
729 if (ret == 0 || !new)
730 return ret;
731
732 /*
733 * If we get -EIO above, zero out any newly allocated blocks
734 * to avoid exposing stale data.
735 */
736 bh = head;
737 block_start = 0;
738 do {
739 void *kaddr;
740
741 block_end = block_start + bsize;
742 if (block_end <= from)
743 goto next_bh;
744 if (block_start >= to)
745 break;
746
747 kaddr = kmap_atomic(page, KM_USER0);
748 memset(kaddr+block_start, 0, bh->b_size);
749 flush_dcache_page(page);
750 kunmap_atomic(kaddr, KM_USER0);
751 set_buffer_uptodate(bh);
752 mark_buffer_dirty(bh);
753
754next_bh:
755 block_start = block_end;
756 bh = bh->b_this_page;
757 } while (bh != head);
758
759 return ret;
760}
761
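/*
 * [Editor's note, not part of the patch.] One physical block cursor is
 * threaded through consecutive calls: *p_blkno advances past each block
 * the call maps, which works because the blocks of a cluster are
 * physically contiguous on disk. Usage as in ocfs2_write() further
 * below (names as in this patch):
 *
 *	for (i = 0; i < numpages; i++)
 *		ret = ocfs2_write_data_page(inode, handle, &p_blkno,
 *					    cpages[i], wc, new);
 */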
762/*
763 * This will copy user data from the buffer page in the splice
764 * context.
765 *
766 * For now, we ignore SPLICE_F_MOVE as that would require some extra
767 * communication out all the way to ocfs2_write().
768 */
769int ocfs2_map_and_write_splice_data(struct inode *inode,
770 struct ocfs2_write_ctxt *wc, u64 *p_blkno,
771 unsigned int *ret_from, unsigned int *ret_to)
772{
773 int ret;
774 unsigned int to, from, cluster_start, cluster_end;
775 char *src, *dst;
776 struct ocfs2_splice_write_priv *sp = wc->w_private;
777 struct pipe_buffer *buf = sp->s_buf;
778 unsigned long bytes, src_from;
779 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
780
781 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
782 &cluster_end);
783
784 from = sp->s_offset;
785 src_from = sp->s_buf_offset;
786 bytes = wc->w_count;
787
788 if (wc->w_large_pages) {
789 /*
790 * For cluster size < page size, we have to
791 * calculate pos within the cluster and obey
792 * the rightmost boundary.
793 */
794 bytes = min(bytes, (unsigned long)(osb->s_clustersize
795 - (wc->w_pos & (osb->s_clustersize - 1))));
796 }
797 to = from + bytes;
798
799 if (wc->w_this_page_new)
800 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
801 cluster_start, cluster_end, 1);
802 else
803 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
804 from, to, 0);
805 if (ret) {
806 mlog_errno(ret);
807 goto out;
808 }
809
810 BUG_ON(from > PAGE_CACHE_SIZE);
811 BUG_ON(to > PAGE_CACHE_SIZE);
812 BUG_ON(from > osb->s_clustersize);
813 BUG_ON(to > osb->s_clustersize);
814
815 src = buf->ops->map(sp->s_pipe, buf, 1);
816 dst = kmap_atomic(wc->w_this_page, KM_USER1);
817 memcpy(dst + from, src + src_from, bytes);
818 kunmap_atomic(wc->w_this_page, KM_USER1);
819 buf->ops->unmap(sp->s_pipe, buf, src);
820
821 wc->w_finished_copy = 1;
822
823 *ret_from = from;
824 *ret_to = to;
825out:
826
827 return bytes ? (unsigned int)bytes : ret;
828}
829
830/*
831 * This will copy user data from the iovec in the buffered write
832 * context.
833 */
834int ocfs2_map_and_write_user_data(struct inode *inode,
835 struct ocfs2_write_ctxt *wc, u64 *p_blkno,
836 unsigned int *ret_from, unsigned int *ret_to)
837{
838 int ret;
839 unsigned int to, from, cluster_start, cluster_end;
840 unsigned long bytes, src_from;
841 char *dst;
842 struct ocfs2_buffered_write_priv *bp = wc->w_private;
843 const struct iovec *cur_iov = bp->b_cur_iov;
844 char __user *buf;
845 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
846
847 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
848 &cluster_end);
849
850 buf = cur_iov->iov_base + bp->b_cur_off;
851 src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
852
853 from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
854
855 /*
856 * This is a lot of comparisons, but it reads quite
857 * easily, which is important here.
858 */
859 /* Stay within the src page */
860 bytes = PAGE_SIZE - src_from;
861 /* Stay within the vector */
862 bytes = min(bytes,
863 (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
864 /* Stay within count */
865 bytes = min(bytes, (unsigned long)wc->w_count);
866 /*
867 * For clustersize > page size, just stay within
868 * target page, otherwise we have to calculate pos
869 * within the cluster and obey the rightmost
870 * boundary.
871 */
872 if (wc->w_large_pages) {
873 /*
874 * For cluster size < page size, we have to
875 * calculate pos within the cluster and obey
876 * the rightmost boundary.
877 */
878 bytes = min(bytes, (unsigned long)(osb->s_clustersize
879 - (wc->w_pos & (osb->s_clustersize - 1))));
880 } else {
881 /*
882 * cluster size > page size is the most common
883 * case - we just stay within the target page
884 * boundary.
885 */
886 bytes = min(bytes, PAGE_CACHE_SIZE - from);
887 }
888
889 to = from + bytes;
890
891 if (wc->w_this_page_new)
892 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
893 cluster_start, cluster_end, 1);
894 else
895 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
896 from, to, 0);
897 if (ret) {
898 mlog_errno(ret);
899 goto out;
900 }
901
902 BUG_ON(from > PAGE_CACHE_SIZE);
903 BUG_ON(to > PAGE_CACHE_SIZE);
904 BUG_ON(from > osb->s_clustersize);
905 BUG_ON(to > osb->s_clustersize);
906
907 dst = kmap(wc->w_this_page);
908 memcpy(dst + from, bp->b_src_buf + src_from, bytes);
909 kunmap(wc->w_this_page);
910
911 /*
912 * XXX: This is slow, but simple. The caller of
913 * ocfs2_buffered_write_cluster() is responsible for
914 * passing through the iovecs, so it's difficult to
915 * predict what our next step is in here after our
916 * initial write. A future version should be pushing
917 * that iovec manipulation further down.
918 *
919 * By setting this, we indicate that a copy from user
920 * data was done, and subsequent calls for this
921 * cluster will skip copying more data.
922 */
923 wc->w_finished_copy = 1;
924
925 *ret_from = from;
926 *ret_to = to;
927out:
928
929 return bytes ? (unsigned int)bytes : ret;
930}
931
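/*
 * [Editor's worked example, not part of the patch; all values
 * illustrative.] With PAGE_SIZE = 4096, src_from = 3000,
 * iov_len - b_cur_off = 5000, w_count = 10000, and (for a
 * w_large_pages file system) s_clustersize = 1024, w_pos = 512:
 *
 *	bytes = 4096 - 3000	 = 1096	(stay within the src page)
 *	bytes = min(1096, 5000)	 = 1096	(stay within the vector)
 *	bytes = min(1096, 10000) = 1096	(stay within count)
 *	bytes = min(1096, 512)	 =  512	(stay within the cluster:
 *					 1024 - (512 & 1023) = 512)
 *
 * so this pass copies 512 bytes; the caller advances and repeats.
 */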
932/*
933 * Map, fill and write a page to disk.
934 *
935 * The work of copying data is done via callback. Newly allocated
936 * pages which don't take user data will be zero'd (set 'new' to
937 * indicate an allocating write)
938 *
939 * Returns a negative error code or the number of bytes copied into
940 * the page.
941 */
942int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
943 u64 *p_blkno, struct page *page,
944 struct ocfs2_write_ctxt *wc, int new)
945{
946 int ret, copied = 0;
947 unsigned int from = 0, to = 0;
948 unsigned int cluster_start, cluster_end;
949 unsigned int zero_from = 0, zero_to = 0;
950
951 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
952 &cluster_start, &cluster_end);
953
954 if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
955 && !wc->w_finished_copy) {
956
957 wc->w_this_page = page;
958 wc->w_this_page_new = new;
959 ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
960 if (ret < 0) {
961 mlog_errno(ret);
962 goto out;
963 }
964
965 copied = ret;
966
967 zero_from = from;
968 zero_to = to;
969 if (new) {
970 from = cluster_start;
971 to = cluster_end;
972 }
973 } else {
974 /*
975 * If we haven't allocated the new page yet, we
976 * shouldn't be writing it out without copying user
977 * data. This is likely a math error from the caller.
978 */
979 BUG_ON(!new);
980
981 from = cluster_start;
982 to = cluster_end;
983
984 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
985 cluster_start, cluster_end, 1);
986 if (ret) {
987 mlog_errno(ret);
988 goto out;
989 }
990 }
991
992 /*
993 * Parts of newly allocated pages need to be zero'd.
994 *
995 * Above, we have also rewritten 'to' and 'from' - as far as
996 * the rest of the function is concerned, the entire cluster
997 * range inside of a page needs to be written.
998 *
999 * We can skip this if the page is up to date - it's already
1000 * been zero'd from being read in as a hole.
1001 */
1002 if (new && !PageUptodate(page))
1003 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
1004 wc->w_cpos, zero_from, zero_to);
1005
1006 flush_dcache_page(page);
1007
1008 if (ocfs2_should_order_data(inode)) {
1009 ret = walk_page_buffers(handle,
1010 page_buffers(page),
1011 from, to, NULL,
1012 ocfs2_journal_dirty_data);
1013 if (ret < 0)
1014 mlog_errno(ret);
1015 }
1016
1017 /*
1018 * We don't use generic_commit_write() because we need to
1019 * handle our own i_size update.
1020 */
1021 ret = block_commit_write(page, from, to);
1022 if (ret)
1023 mlog_errno(ret);
1024out:
1025
1026 return copied ? copied : ret;
1027}
1028
1029/*
1030 * Do the actual write of some data into an inode. Optionally allocate
1031 * in order to fulfill the write.
1032 *
1033 * cpos is the logical cluster offset within the file to write at
1034 *
1035 * 'phys' is the physical mapping of that offset. A 'phys' value of
1036 * zero indicates that allocation is required. In this case, data_ac
1037 * and meta_ac should be valid (meta_ac can be null if metadata
1038 * allocation isn't required).
1039 */
1040static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
1041 struct buffer_head *di_bh,
1042 struct ocfs2_alloc_context *data_ac,
1043 struct ocfs2_alloc_context *meta_ac,
1044 struct ocfs2_write_ctxt *wc)
1045{
1046 int ret, i, numpages = 1, new;
1047 unsigned int copied = 0;
1048 u32 tmp_pos;
1049 u64 v_blkno, p_blkno;
1050 struct address_space *mapping = file->f_mapping;
1051 struct inode *inode = mapping->host;
1052 unsigned long index, start;
1053 struct page **cpages;
1054
1055 new = phys == 0 ? 1 : 0;
1056
1057 /*
1058 * Figure out how many pages we'll be manipulating here. For
1059 * a non-allocating write, we just change the one
1060 * page. Otherwise, we'll need a whole cluster's worth.
1061 */
1062 if (new)
1063 numpages = ocfs2_pages_per_cluster(inode->i_sb);
1064
1065 cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
1066 if (!cpages) {
1067 ret = -ENOMEM;
1068 mlog_errno(ret);
1069 return ret;
1070 }
1071
1072 /*
1073 * Fill our page array first. That way we've grabbed enough so
1074 * that we can zero and flush if we error after adding the
1075 * extent.
1076 */
1077 if (new) {
1078 start = ocfs2_align_clusters_to_page_index(inode->i_sb,
1079 wc->w_cpos);
1080 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
1081 } else {
1082 start = wc->w_pos >> PAGE_CACHE_SHIFT;
1083 v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
1084 }
1085
1086 for(i = 0; i < numpages; i++) {
1087 index = start + i;
1088
1089 cpages[i] = grab_cache_page(mapping, index);
1090 if (!cpages[i]) {
1091 ret = -ENOMEM;
1092 mlog_errno(ret);
1093 goto out;
1094 }
1095 }
1096
1097 if (new) {
1098 /*
1099 * This is safe to call with the page locks - it won't take
1100 * any additional semaphores or cluster locks.
1101 */
1102 tmp_pos = wc->w_cpos;
1103 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
1104 &tmp_pos, 1, di_bh, handle,
1105 data_ac, meta_ac, NULL);
1106 /*
1107 * This shouldn't happen because we must have already
1108 * calculated the correct meta data allocation required. The
1109 * internal tree allocation code should know how to increase
1110 * transaction credits itself.
1111 *
1112 * If need be, we could handle -EAGAIN for a
1113 * RESTART_TRANS here.
1114 */
1115 mlog_bug_on_msg(ret == -EAGAIN,
1116 "Inode %llu: EAGAIN return during allocation.\n",
1117 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1118 if (ret < 0) {
1119 mlog_errno(ret);
1120 goto out;
1121 }
1122 }
1123
1124 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
1125 NULL);
1126 if (ret < 0) {
1127
1128 /*
1129 * XXX: Should we go readonly here?
1130 */
1131
1132 mlog_errno(ret);
1133 goto out;
1134 }
1135
1136 BUG_ON(p_blkno == 0);
1137
1138 for(i = 0; i < numpages; i++) {
1139 ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
1140 wc, new);
1141 if (ret < 0) {
1142 mlog_errno(ret);
1143 goto out;
1144 }
1145
1146 copied += ret;
1147 }
1148
1149out:
1150 for(i = 0; i < numpages; i++) {
1151 unlock_page(cpages[i]);
1152 mark_page_accessed(cpages[i]);
1153 page_cache_release(cpages[i]);
1154 }
1155 kfree(cpages);
1156
1157 return copied ? copied : ret;
1158}
1159
1160static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
1161 struct ocfs2_super *osb, loff_t pos,
1162 size_t count, ocfs2_page_writer *cb,
1163 void *cb_priv)
1164{
1165 wc->w_count = count;
1166 wc->w_pos = pos;
1167 wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
1168 wc->w_finished_copy = 0;
1169
1170 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
1171 wc->w_large_pages = 1;
1172 else
1173 wc->w_large_pages = 0;
1174
1175 wc->w_write_data_page = cb;
1176 wc->w_private = cb_priv;
1177}
1178
1179/*
1180 * Write a cluster to an inode. The cluster may not be allocated yet,
1181 * in which case it will be. This only exists for buffered writes -
1182 * O_DIRECT takes a more "traditional" path through the kernel.
1183 *
1184 * The caller is responsible for incrementing pos, written counts, etc
1185 *
1186 * For file systems that don't support sparse files, pre-allocation
1187 * and page zeroing up until cpos should be done prior to this
1188 * function call.
1189 *
1190 * Callers should be holding i_sem, and the rw cluster lock.
1191 *
1192 * Returns the number of user bytes written, or less than zero for
1193 * error.
1194 */
1195ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
1196 size_t count, ocfs2_page_writer *actor,
1197 void *priv)
1198{
1199 int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
1200 ssize_t written = 0;
1201 u32 phys;
1202 struct inode *inode = file->f_mapping->host;
1203 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1204 struct buffer_head *di_bh = NULL;
1205 struct ocfs2_dinode *di;
1206 struct ocfs2_alloc_context *data_ac = NULL;
1207 struct ocfs2_alloc_context *meta_ac = NULL;
1208 handle_t *handle;
1209 struct ocfs2_write_ctxt wc;
1210
1211 ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
1212
1213 ret = ocfs2_meta_lock(inode, &di_bh, 1);
1214 if (ret) {
1215 mlog_errno(ret);
1216 goto out;
1217 }
1218 di = (struct ocfs2_dinode *)di_bh->b_data;
1219
1220 /*
1221 * Take alloc sem here to prevent concurrent lookups. That way
1222 * the mapping, zeroing and tree manipulation within
1223 * ocfs2_write() will be safe against ->readpage(). This
1224 * should also serve to lock out allocation from a shared
1225 * writeable region.
1226 */
1227 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1228
1229 ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL);
1230 if (ret) {
1231 mlog_errno(ret);
1232 goto out_meta;
1233 }
1234
1235 /* phys == 0 means that allocation is required. */
1236 if (phys == 0) {
1237 ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
1238 if (ret) {
1239 mlog_errno(ret);
1240 goto out_meta;
1241 }
1242
1243 credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
1244 }
1245
1246 ret = ocfs2_data_lock(inode, 1);
1247 if (ret) {
1248 mlog_errno(ret);
1249 goto out_meta;
1250 }
1251
1252 handle = ocfs2_start_trans(osb, credits);
1253 if (IS_ERR(handle)) {
1254 ret = PTR_ERR(handle);
1255 mlog_errno(ret);
1256 goto out_data;
1257 }
1258
1259 written = ocfs2_write(file, phys, handle, di_bh, data_ac,
1260 meta_ac, &wc);
1261 if (written < 0) {
1262 ret = written;
1263 mlog_errno(ret);
1264 goto out_commit;
1265 }
1266
1267 ret = ocfs2_journal_access(handle, inode, di_bh,
1268 OCFS2_JOURNAL_ACCESS_WRITE);
1269 if (ret) {
1270 mlog_errno(ret);
1271 goto out_commit;
1272 }
1273
1274 pos += written;
1275 if (pos > inode->i_size) {
1276 i_size_write(inode, pos);
1277 mark_inode_dirty(inode);
1278 }
1279 inode->i_blocks = ocfs2_inode_sector_count(inode);
1280 di->i_size = cpu_to_le64((u64)i_size_read(inode));
1281 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1282 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
1283 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1284
1285 ret = ocfs2_journal_dirty(handle, di_bh);
1286 if (ret)
1287 mlog_errno(ret);
1288
1289out_commit:
1290 ocfs2_commit_trans(osb, handle);
1291
1292out_data:
1293 ocfs2_data_unlock(inode, 1);
1294
1295out_meta:
1296 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1297 ocfs2_meta_unlock(inode, 1);
1298
1299out:
1300 brelse(di_bh);
1301 if (data_ac)
1302 ocfs2_free_alloc_context(data_ac);
1303 if (meta_ac)
1304 ocfs2_free_alloc_context(meta_ac);
1305
1306 return written ? written : ret;
1307}
1308
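/*
 * [Editor's summary of the locking established above, not part of the
 * patch. The caller already holds i_mutex and the rw cluster lock.]
 *
 *	ocfs2_meta_lock(inode, &di_bh, 1)	    EX meta lock
 *	  down_write(&OCFS2_I(inode)->ip_alloc_sem) fence ->readpage()
 *	    ocfs2_data_lock(inode, 1)		    EX data lock
 *	      handle = ocfs2_start_trans(osb, credits)
 *	      ... ocfs2_write(), i_size/dinode update ...
 *	      ocfs2_commit_trans(osb, handle)
 *	    ocfs2_data_unlock(inode, 1)
 *	  up_write(&OCFS2_I(inode)->ip_alloc_sem)
 *	ocfs2_meta_unlock(inode, 1)
 */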
678const struct address_space_operations ocfs2_aops = { 1309const struct address_space_operations ocfs2_aops = {
679 .readpage = ocfs2_readpage, 1310 .readpage = ocfs2_readpage,
680 .writepage = ocfs2_writepage, 1311 .writepage = ocfs2_writepage,
681 .prepare_write = ocfs2_prepare_write,
682 .commit_write = ocfs2_commit_write,
683 .bmap = ocfs2_bmap, 1312 .bmap = ocfs2_bmap,
684 .sync_page = block_sync_page, 1313 .sync_page = block_sync_page,
685 .direct_IO = ocfs2_direct_IO, 1314 .direct_IO = ocfs2_direct_IO,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index f446a15eab88..45821d479b5a 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -30,12 +30,83 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
30 unsigned from, 30 unsigned from,
31 unsigned to); 31 unsigned to);
32 32
33int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
34 struct inode *inode, unsigned int from,
35 unsigned int to, int new);
36
37int walk_page_buffers( handle_t *handle,
38 struct buffer_head *head,
39 unsigned from,
40 unsigned to,
41 int *partial,
42 int (*fn)( handle_t *handle,
43 struct buffer_head *bh));
44
45struct ocfs2_write_ctxt;
46typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
47 u64 *, unsigned int *, unsigned int *);
48
49ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
50 size_t count, ocfs2_page_writer *actor,
51 void *priv);
52
53struct ocfs2_write_ctxt {
54 size_t w_count;
55 loff_t w_pos;
56 u32 w_cpos;
57 unsigned int w_finished_copy;
58
59 /* This is true if page_size > cluster_size */
60 unsigned int w_large_pages;
61
62 /* Filler callback and private data */
63 ocfs2_page_writer *w_write_data_page;
64 void *w_private;
65
66 /* Only valid for the filler callback */
67 struct page *w_this_page;
68 unsigned int w_this_page_new;
69};
70
71struct ocfs2_buffered_write_priv {
72 char *b_src_buf;
73 const struct iovec *b_cur_iov; /* Current iovec */
74 size_t b_cur_off; /* Offset in the
75 * current iovec */
76};
77int ocfs2_map_and_write_user_data(struct inode *inode,
78 struct ocfs2_write_ctxt *wc,
79 u64 *p_blkno,
80 unsigned int *ret_from,
81 unsigned int *ret_to);
82
83struct ocfs2_splice_write_priv {
84 struct splice_desc *s_sd;
85 struct pipe_buffer *s_buf;
86 struct pipe_inode_info *s_pipe;
87 /* Neither offset value is ever larger than one page */
88 unsigned int s_offset;
89 unsigned int s_buf_offset;
90};
91int ocfs2_map_and_write_splice_data(struct inode *inode,
92 struct ocfs2_write_ctxt *wc,
93 u64 *p_blkno,
94 unsigned int *ret_from,
95 unsigned int *ret_to);
96
33/* all ocfs2_dio_end_io()'s fault */ 97/* all ocfs2_dio_end_io()'s fault */
34#define ocfs2_iocb_is_rw_locked(iocb) \ 98#define ocfs2_iocb_is_rw_locked(iocb) \
35 test_bit(0, (unsigned long *)&iocb->private) 99 test_bit(0, (unsigned long *)&iocb->private)
36#define ocfs2_iocb_set_rw_locked(iocb) \ 100static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
37 set_bit(0, (unsigned long *)&iocb->private) 101{
102 set_bit(0, (unsigned long *)&iocb->private);
103 if (level)
104 set_bit(1, (unsigned long *)&iocb->private);
105 else
106 clear_bit(1, (unsigned long *)&iocb->private);
107}
38#define ocfs2_iocb_clear_rw_locked(iocb) \ 108#define ocfs2_iocb_clear_rw_locked(iocb) \
39 clear_bit(0, (unsigned long *)&iocb->private) 109 clear_bit(0, (unsigned long *)&iocb->private)
40 110#define ocfs2_iocb_rw_locked_level(iocb) \
111 test_bit(1, (unsigned long *)&iocb->private)
41#endif /* OCFS2_FILE_H */ 112#endif /* OCFS2_FILE_H */
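
[Editor's note: a sketch of the bit encoding above, not part of the
patch. iocb->private now carries two bits: bit 0 records that the rw
cluster lock is held, bit 1 records the level it was taken at, so that
ocfs2_dio_end_io() can tell whether i_alloc_sem must be released.]

	/* Submission side: take the rw lock, remember the level. */
	ocfs2_iocb_set_rw_locked(iocb, level);	/* bit 0 set, level in bit 1 */

	/* Completion side, as in ocfs2_dio_end_io(): */
	ocfs2_iocb_clear_rw_locked(iocb);
	level = ocfs2_iocb_rw_locked_level(iocb);
	if (!level)				/* read-level: drop i_alloc_sem */
		up_read(&inode->i_alloc_sem);
	ocfs2_rw_unlock(inode, level);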
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 4705d659fe57..bbacf7da48a4 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -46,6 +46,7 @@
46#include <linux/kernel.h> 46#include <linux/kernel.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/workqueue.h> 48#include <linux/workqueue.h>
49#include <linux/reboot.h>
49 50
50#include "heartbeat.h" 51#include "heartbeat.h"
51#include "nodemanager.h" 52#include "nodemanager.h"
@@ -72,7 +73,9 @@ static void o2quo_fence_self(void)
72 /* panic spins with interrupts enabled. with preempt 73 /* panic spins with interrupts enabled. with preempt
73 * threads can still schedule, etc, etc */ 74 * threads can still schedule, etc, etc */
74 o2hb_stop_all_regions(); 75 o2hb_stop_all_regions();
75 panic("ocfs2 is very sorry to be fencing this system by panicing\n"); 76
77 printk("ocfs2 is very sorry to be fencing this system by restarting\n");
78 emergency_restart();
76} 79}
77 80
78/* Indicate that a timeout occurred on a heartbeat region write. The 81
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 4dae5df5e467..9606111fe89d 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,9 @@
38 * locking semantics of the file system using the protocol. It should 38 * locking semantics of the file system using the protocol. It should
39 * be somewhere else, I'm sure, but right now it isn't. 39 * be somewhere else, I'm sure, but right now it isn't.
40 * 40 *
41 * New in version 8:
42 * - Replace delete inode votes with a cluster lock
43 *
41 * New in version 7: 44 * New in version 7:
42 * - DLM join domain includes the live nodemap 45 * - DLM join domain includes the live nodemap
43 * 46 *
@@ -57,7 +60,7 @@
57 * - full 64 bit i_size in the metadata lock lvbs 60 * - full 64 bit i_size in the metadata lock lvbs
58 * - introduction of "rw" lock and pushing meta/data locking down 61 * - introduction of "rw" lock and pushing meta/data locking down
59 */ 62 */
60#define O2NET_PROTOCOL_VERSION 7ULL 63#define O2NET_PROTOCOL_VERSION 8ULL
61struct o2net_handshake { 64struct o2net_handshake {
62 __be64 protocol_version; 65 __be64 protocol_version;
63 __be64 connector_id; 66 __be64 connector_id;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 66821e178167..67e6866a2a4f 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -358,15 +358,17 @@ int ocfs2_do_extend_dir(struct super_block *sb,
358{ 358{
359 int status; 359 int status;
360 int extend; 360 int extend;
361 u64 p_blkno; 361 u64 p_blkno, v_blkno;
362 362
363 spin_lock(&OCFS2_I(dir)->ip_lock); 363 spin_lock(&OCFS2_I(dir)->ip_lock);
364 extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); 364 extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
365 spin_unlock(&OCFS2_I(dir)->ip_lock); 365 spin_unlock(&OCFS2_I(dir)->ip_lock);
366 366
367 if (extend) { 367 if (extend) {
368 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1, 368 u32 offset = OCFS2_I(dir)->ip_clusters;
369 parent_fe_bh, handle, 369
370 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
371 1, parent_fe_bh, handle,
370 data_ac, meta_ac, NULL); 372 data_ac, meta_ac, NULL);
371 BUG_ON(status == -EAGAIN); 373 BUG_ON(status == -EAGAIN);
372 if (status < 0) { 374 if (status < 0) {
@@ -375,9 +377,8 @@ int ocfs2_do_extend_dir(struct super_block *sb,
375 } 377 }
376 } 378 }
377 379
378 status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >> 380 v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir));
379 (sb->s_blocksize_bits - 9)), 381 status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL);
380 1, &p_blkno, NULL);
381 if (status < 0) { 382 if (status < 0) {
382 mlog_errno(status); 383 mlog_errno(status);
383 goto bail; 384 goto bail;
@@ -486,7 +487,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
486 487
487 dir_i_size += dir->i_sb->s_blocksize; 488 dir_i_size += dir->i_sb->s_blocksize;
488 i_size_write(dir, dir_i_size); 489 i_size_write(dir, dir_i_size);
489 dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size); 490 dir->i_blocks = ocfs2_inode_sector_count(dir);
490 status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); 491 status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
491 if (status < 0) { 492 if (status < 0) {
492 mlog_errno(status); 493 mlog_errno(status);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index c558442a0b44..d836b98dd99a 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -430,11 +430,10 @@ redo_bucket:
430 430
431 dlm_lockres_put(res); 431 dlm_lockres_put(res);
432 432
433 cond_resched_lock(&dlm->spinlock);
434
435 if (dropped) 433 if (dropped)
436 goto redo_bucket; 434 goto redo_bucket;
437 } 435 }
436 cond_resched_lock(&dlm->spinlock);
438 num += n; 437 num += n;
439 mlog(0, "%s: touched %d lockreses in bucket %d " 438 mlog(0, "%s: touched %d lockreses in bucket %d "
440 "(tot=%d)\n", dlm->name, n, i, num); 439 "(tot=%d)\n", dlm->name, n, i, num);
@@ -1035,7 +1034,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
1035{ 1034{
1036 int status = 0, tmpstat, node; 1035 int status = 0, tmpstat, node;
1037 struct domain_join_ctxt *ctxt; 1036 struct domain_join_ctxt *ctxt;
1038 enum dlm_query_join_response response; 1037 enum dlm_query_join_response response = JOIN_DISALLOW;
1039 1038
1040 mlog_entry("%p", dlm); 1039 mlog_entry("%p", dlm);
1041 1040
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 6d4a83d50152..c1807a42c49f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -611,6 +611,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
611 } 611 }
612 } while (status != 0); 612 } while (status != 0);
613 613
614 spin_lock(&dlm_reco_state_lock);
614 switch (ndata->state) { 615 switch (ndata->state) {
615 case DLM_RECO_NODE_DATA_INIT: 616 case DLM_RECO_NODE_DATA_INIT:
616 case DLM_RECO_NODE_DATA_FINALIZE_SENT: 617 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
@@ -641,6 +642,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
641 ndata->node_num, dead_node); 642 ndata->node_num, dead_node);
642 break; 643 break;
643 } 644 }
645 spin_unlock(&dlm_reco_state_lock);
644 } 646 }
645 647
646 mlog(0, "done requesting all lock info\n"); 648 mlog(0, "done requesting all lock info\n");
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e335541727f9..27e43b0c0eae 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -225,11 +225,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
225 .flags = 0, 225 .flags = 0,
226}; 226};
227 227
228static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
229 .get_osb = ocfs2_get_inode_osb,
230 .flags = 0,
231};
232
228static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 233static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
229{ 234{
230 return lockres->l_type == OCFS2_LOCK_TYPE_META || 235 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
231 lockres->l_type == OCFS2_LOCK_TYPE_DATA || 236 lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
232 lockres->l_type == OCFS2_LOCK_TYPE_RW; 237 lockres->l_type == OCFS2_LOCK_TYPE_RW ||
238 lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
233} 239}
234 240
235static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) 241static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
@@ -373,6 +379,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
373 case OCFS2_LOCK_TYPE_DATA: 379 case OCFS2_LOCK_TYPE_DATA:
374 ops = &ocfs2_inode_data_lops; 380 ops = &ocfs2_inode_data_lops;
375 break; 381 break;
382 case OCFS2_LOCK_TYPE_OPEN:
383 ops = &ocfs2_inode_open_lops;
384 break;
376 default: 385 default:
377 mlog_bug_on_msg(1, "type: %d\n", type); 386 mlog_bug_on_msg(1, "type: %d\n", type);
378 ops = NULL; /* thanks, gcc */ 387 ops = NULL; /* thanks, gcc */
@@ -1129,6 +1138,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1129 goto bail; 1138 goto bail;
1130 } 1139 }
1131 1140
1141 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
1142 if (ret) {
1143 mlog_errno(ret);
1144 goto bail;
1145 }
1146
1132bail: 1147bail:
1133 mlog_exit(ret); 1148 mlog_exit(ret);
1134 return ret; 1149 return ret;
@@ -1182,6 +1197,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
1182 mlog_exit_void(); 1197 mlog_exit_void();
1183} 1198}
1184 1199
1200/*
1201 * ocfs2_open_lock always gets a PR mode lock.
1202 */
1203int ocfs2_open_lock(struct inode *inode)
1204{
1205 int status = 0;
1206 struct ocfs2_lock_res *lockres;
1207 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1208
1209 BUG_ON(!inode);
1210
1211 mlog_entry_void();
1212
1213 mlog(0, "inode %llu take PRMODE open lock\n",
1214 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1215
1216 if (ocfs2_mount_local(osb))
1217 goto out;
1218
1219 lockres = &OCFS2_I(inode)->ip_open_lockres;
1220
1221 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
1222 LKM_PRMODE, 0, 0);
1223 if (status < 0)
1224 mlog_errno(status);
1225
1226out:
1227 mlog_exit(status);
1228 return status;
1229}
1230
1231int ocfs2_try_open_lock(struct inode *inode, int write)
1232{
1233 int status = 0, level;
1234 struct ocfs2_lock_res *lockres;
1235 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1236
1237 BUG_ON(!inode);
1238
1239 mlog_entry_void();
1240
1241 mlog(0, "inode %llu try to take %s open lock\n",
1242 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1243 write ? "EXMODE" : "PRMODE");
1244
1245 if (ocfs2_mount_local(osb))
1246 goto out;
1247
1248 lockres = &OCFS2_I(inode)->ip_open_lockres;
1249
1250 level = write ? LKM_EXMODE : LKM_PRMODE;
1251
1252 /*
1253 * The file system may already be holding a PRMODE/EXMODE open lock.
1254 * Since we pass LKM_NOQUEUE, the request won't block waiting on
1255 * other nodes and the -EAGAIN will indicate to the caller that
1256 * this inode is still in use.
1257 */
1258 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
1259 level, LKM_NOQUEUE, 0);
1260
1261out:
1262 mlog_exit(status);
1263 return status;
1264}
1265
1266/*
1267 * ocfs2_open_unlock unlocks PR and EX mode open locks.
1268 */
1269void ocfs2_open_unlock(struct inode *inode)
1270{
1271 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
1272 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1273
1274 mlog_entry_void();
1275
1276 mlog(0, "inode %llu drop open lock\n",
1277 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1278
1279 if (ocfs2_mount_local(osb))
1280 goto out;
1281
1282 if (lockres->l_ro_holders)
1283 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
1284 LKM_PRMODE);
1285 if (lockres->l_ex_holders)
1286 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
1287 LKM_EXMODE);
1288
1289out:
1290 mlog_exit_void();
1291}
1292
1185int ocfs2_data_lock_full(struct inode *inode, 1293int ocfs2_data_lock_full(struct inode *inode,
1186 int write, 1294 int write,
1187 int arg_flags) 1295 int arg_flags)
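The LKM_NOQUEUE comment above is the core of the new open-lock protocol: a node that wants to know whether an inode is still open anywhere in the cluster simply trylocks the open lock and treats -EAGAIN as "in use". A hypothetical caller sketch (inode_still_in_use() is illustrative, not an ocfs2 function; error handling trimmed):

    /*
     * Returns 1 if some node still holds the inode open, 0 if the
     * EX trylock succeeded and the inode can be wiped safely.
     */
    static int inode_still_in_use(struct inode *inode)
    {
            int status = ocfs2_try_open_lock(inode, 1);

            if (status == -EAGAIN)
                    return 1;       /* open lock held elsewhere */
            if (status < 0)
                    mlog_errno(status);
            return 0;
    }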
@@ -1387,8 +1495,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1387 if (S_ISLNK(inode->i_mode) && !oi->ip_clusters) 1495 if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
1388 inode->i_blocks = 0; 1496 inode->i_blocks = 0;
1389 else 1497 else
1390 inode->i_blocks = 1498 inode->i_blocks = ocfs2_inode_sector_count(inode);
1391 ocfs2_align_bytes_to_sectors(i_size_read(inode));
1392 1499
1393 inode->i_uid = be32_to_cpu(lvb->lvb_iuid); 1500 inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
1394 inode->i_gid = be32_to_cpu(lvb->lvb_igid); 1501 inode->i_gid = be32_to_cpu(lvb->lvb_igid);
@@ -1479,12 +1586,15 @@ static int ocfs2_meta_lock_update(struct inode *inode,
1479{ 1586{
1480 int status = 0; 1587 int status = 0;
1481 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1588 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1482 struct ocfs2_lock_res *lockres = NULL; 1589 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
1483 struct ocfs2_dinode *fe; 1590 struct ocfs2_dinode *fe;
1484 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1591 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1485 1592
1486 mlog_entry_void(); 1593 mlog_entry_void();
1487 1594
1595 if (ocfs2_mount_local(osb))
1596 goto bail;
1597
1488 spin_lock(&oi->ip_lock); 1598 spin_lock(&oi->ip_lock);
1489 if (oi->ip_flags & OCFS2_INODE_DELETED) { 1599 if (oi->ip_flags & OCFS2_INODE_DELETED) {
1490 mlog(0, "Orphaned inode %llu was deleted while we " 1600 mlog(0, "Orphaned inode %llu was deleted while we "
@@ -1496,22 +1606,16 @@ static int ocfs2_meta_lock_update(struct inode *inode,
1496 } 1606 }
1497 spin_unlock(&oi->ip_lock); 1607 spin_unlock(&oi->ip_lock);
1498 1608
1499 if (!ocfs2_mount_local(osb)) { 1609 if (!ocfs2_should_refresh_lock_res(lockres))
1500 lockres = &oi->ip_meta_lockres; 1610 goto bail;
1501
1502 if (!ocfs2_should_refresh_lock_res(lockres))
1503 goto bail;
1504 }
1505 1611
1506 /* This will discard any caching information we might have had 1612 /* This will discard any caching information we might have had
1507 * for the inode metadata. */ 1613 * for the inode metadata. */
1508 ocfs2_metadata_cache_purge(inode); 1614 ocfs2_metadata_cache_purge(inode);
1509 1615
1510 /* will do nothing for inode types that don't use the extent
1511 * map (directories, bitmap files, etc) */
1512 ocfs2_extent_map_trunc(inode, 0); 1616 ocfs2_extent_map_trunc(inode, 0);
1513 1617
1514 if (lockres && ocfs2_meta_lvb_is_trustable(inode, lockres)) { 1618 if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
1515 mlog(0, "Trusting LVB on inode %llu\n", 1619 mlog(0, "Trusting LVB on inode %llu\n",
1516 (unsigned long long)oi->ip_blkno); 1620 (unsigned long long)oi->ip_blkno);
1517 ocfs2_refresh_inode_from_lvb(inode); 1621 ocfs2_refresh_inode_from_lvb(inode);
@@ -1558,8 +1662,7 @@ static int ocfs2_meta_lock_update(struct inode *inode,
1558 1662
1559 status = 0; 1663 status = 0;
1560bail_refresh: 1664bail_refresh:
1561 if (lockres) 1665 ocfs2_complete_lock_res_refresh(lockres, status);
1562 ocfs2_complete_lock_res_refresh(lockres, status);
1563bail: 1666bail:
1564 mlog_exit(status); 1667 mlog_exit(status);
1565 return status; 1668 return status;
@@ -1630,7 +1733,6 @@ int ocfs2_meta_lock_full(struct inode *inode,
1630 wait_event(osb->recovery_event, 1733 wait_event(osb->recovery_event,
1631 ocfs2_node_map_is_empty(osb, &osb->recovery_map)); 1734 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1632 1735
1633 acquired = 0;
1634 lockres = &OCFS2_I(inode)->ip_meta_lockres; 1736 lockres = &OCFS2_I(inode)->ip_meta_lockres;
1635 level = ex ? LKM_EXMODE : LKM_PRMODE; 1737 level = ex ? LKM_EXMODE : LKM_PRMODE;
1636 dlm_flags = 0; 1738 dlm_flags = 0;
@@ -2458,13 +2560,20 @@ int ocfs2_drop_inode_locks(struct inode *inode)
2458 * ocfs2_clear_inode has done it for us. */ 2560 * ocfs2_clear_inode has done it for us. */
2459 2561
2460 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), 2562 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2461 &OCFS2_I(inode)->ip_data_lockres); 2563 &OCFS2_I(inode)->ip_open_lockres);
2462 if (err < 0) 2564 if (err < 0)
2463 mlog_errno(err); 2565 mlog_errno(err);
2464 2566
2465 status = err; 2567 status = err;
2466 2568
2467 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), 2569 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2570 &OCFS2_I(inode)->ip_data_lockres);
2571 if (err < 0)
2572 mlog_errno(err);
2573 if (err < 0 && !status)
2574 status = err;
2575
2576 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2468 &OCFS2_I(inode)->ip_meta_lockres); 2577 &OCFS2_I(inode)->ip_meta_lockres);
2469 if (err < 0) 2578 if (err < 0)
2470 mlog_errno(err); 2579 mlog_errno(err);
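ocfs2_drop_inode_locks() now tears down three lock resources (open, data, meta) and must report the first failure while still attempting every remaining drop. A self-contained model of that aggregation pattern (plain C, slightly simplified from the kernel code):

    #include <stdio.h>

    /* Record the first failure but keep going: later errors are
     * reported yet must not overwrite the status returned to the
     * caller. */
    static int drop_all(const int errs[], int n)
    {
            int i, err, status = 0;

            for (i = 0; i < n; i++) {
                    err = errs[i];
                    if (err < 0)
                            fprintf(stderr, "drop %d failed: %d\n", i, err);
                    if (err < 0 && !status)
                            status = err;
            }
            return status;
    }

    int main(void)
    {
            int errs[] = { 0, -5, -22 };

            printf("status = %d\n", drop_all(errs, 3)); /* -5, the first error */
            return 0;
    }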
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index c343fca68cf1..59cb566e7983 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode,
80 int write); 80 int write);
81int ocfs2_rw_lock(struct inode *inode, int write); 81int ocfs2_rw_lock(struct inode *inode, int write);
82void ocfs2_rw_unlock(struct inode *inode, int write); 82void ocfs2_rw_unlock(struct inode *inode, int write);
83int ocfs2_open_lock(struct inode *inode);
84int ocfs2_try_open_lock(struct inode *inode, int write);
85void ocfs2_open_unlock(struct inode *inode);
83int ocfs2_meta_lock_atime(struct inode *inode, 86int ocfs2_meta_lock_atime(struct inode *inode,
84 struct vfsmount *vfsmnt, 87 struct vfsmount *vfsmnt,
85 int *level); 88 int *level);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 80ac69f11d9f..ba2b2ab1c6e4 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -3,8 +3,7 @@
3 * 3 *
4 * extent_map.c 4 * extent_map.c
5 * 5 *
6 * In-memory extent map for OCFS2. Man, this code was prettier in 6 * Block/Cluster mapping functions
7 * the library.
8 * 7 *
9 * Copyright (C) 2004 Oracle. All rights reserved. 8 * Copyright (C) 2004 Oracle. All rights reserved.
10 * 9 *
@@ -26,1016 +25,528 @@
26#include <linux/fs.h> 25#include <linux/fs.h>
27#include <linux/init.h> 26#include <linux/init.h>
28#include <linux/types.h> 27#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/rbtree.h>
31 28
32#define MLOG_MASK_PREFIX ML_EXTENT_MAP 29#define MLOG_MASK_PREFIX ML_EXTENT_MAP
33#include <cluster/masklog.h> 30#include <cluster/masklog.h>
34 31
35#include "ocfs2.h" 32#include "ocfs2.h"
36 33
34#include "alloc.h"
37#include "extent_map.h" 35#include "extent_map.h"
38#include "inode.h" 36#include "inode.h"
39#include "super.h" 37#include "super.h"
40 38
41#include "buffer_head_io.h" 39#include "buffer_head_io.h"
42 40
43
44/* 41/*
45 * SUCK SUCK SUCK 42 * The extent caching implementation is intentionally trivial.
46 * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
47 */
48
49struct ocfs2_extent_map_entry {
50 struct rb_node e_node;
51 int e_tree_depth;
52 struct ocfs2_extent_rec e_rec;
53};
54
55struct ocfs2_em_insert_context {
56 int need_left;
57 int need_right;
58 struct ocfs2_extent_map_entry *new_ent;
59 struct ocfs2_extent_map_entry *old_ent;
60 struct ocfs2_extent_map_entry *left_ent;
61 struct ocfs2_extent_map_entry *right_ent;
62};
63
64static struct kmem_cache *ocfs2_em_ent_cachep = NULL;
65
66
67static struct ocfs2_extent_map_entry *
68ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
69 u32 cpos, u32 clusters,
70 struct rb_node ***ret_p,
71 struct rb_node **ret_parent);
72static int ocfs2_extent_map_insert(struct inode *inode,
73 struct ocfs2_extent_rec *rec,
74 int tree_depth);
75static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
76 struct ocfs2_extent_map_entry *ent);
77static int ocfs2_extent_map_find_leaf(struct inode *inode,
78 u32 cpos, u32 clusters,
79 struct ocfs2_extent_list *el);
80static int ocfs2_extent_map_lookup_read(struct inode *inode,
81 u32 cpos, u32 clusters,
82 struct ocfs2_extent_map_entry **ret_ent);
83static int ocfs2_extent_map_try_insert(struct inode *inode,
84 struct ocfs2_extent_rec *rec,
85 int tree_depth,
86 struct ocfs2_em_insert_context *ctxt);
87
88/* returns 1 only if the rec contains all the given clusters -- that is that
89 * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
90 * clusters) is >= the argument's endpoint */
91static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
92 u32 cpos, u32 clusters)
93{
94 if (le32_to_cpu(rec->e_cpos) > cpos)
95 return 0;
96 if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
97 le32_to_cpu(rec->e_clusters))
98 return 0;
99 return 1;
100}
101
102
103/*
104 * Find an entry in the tree that intersects the region passed in.
105 * Note that this will find straddled intervals, it is up to the
106 * callers to enforce any boundary conditions.
107 *
108 * Callers must hold ip_lock. This lookup is not guaranteed to return
109 * a tree_depth 0 match, and as such can race inserts if the lock
110 * were not held.
111 * 43 *
112 * The rb_node garbage lets insertion share the search. Trivial 44 * We only cache a small number of extents stored directly on the
113 * callers pass NULL. 45 * inode, so linear order operations are acceptable. If we ever want
46 * to increase the size of the extent map, then these algorithms must
47 * get smarter.
114 */ 48 */
115static struct ocfs2_extent_map_entry * 49
116ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, 50void ocfs2_extent_map_init(struct inode *inode)
117 u32 cpos, u32 clusters,
118 struct rb_node ***ret_p,
119 struct rb_node **ret_parent)
120{ 51{
121 struct rb_node **p = &em->em_extents.rb_node; 52 struct ocfs2_inode_info *oi = OCFS2_I(inode);
122 struct rb_node *parent = NULL;
123 struct ocfs2_extent_map_entry *ent = NULL;
124
125 while (*p)
126 {
127 parent = *p;
128 ent = rb_entry(parent, struct ocfs2_extent_map_entry,
129 e_node);
130 if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
131 p = &(*p)->rb_left;
132 ent = NULL;
133 } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
134 le32_to_cpu(ent->e_rec.e_clusters))) {
135 p = &(*p)->rb_right;
136 ent = NULL;
137 } else
138 break;
139 }
140 53
141 if (ret_p != NULL) 54 oi->ip_extent_map.em_num_items = 0;
142 *ret_p = p; 55 INIT_LIST_HEAD(&oi->ip_extent_map.em_list);
143 if (ret_parent != NULL)
144 *ret_parent = parent;
145 return ent;
146} 56}
147 57
148/* 58static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
149 * Find the leaf containing the interval we want. While we're on our 59 unsigned int cpos,
150 * way down the tree, fill in every record we see at any depth, because 60 struct ocfs2_extent_map_item **ret_emi)
151 * we might want it later.
152 *
153 * Note that this code is run without ip_lock. That's because it
154 * sleeps while reading. If someone is also filling the extent list at
155 * the same time we are, we might have to restart.
156 */
157static int ocfs2_extent_map_find_leaf(struct inode *inode,
158 u32 cpos, u32 clusters,
159 struct ocfs2_extent_list *el)
160{ 61{
161 int i, ret; 62 unsigned int range;
162 struct buffer_head *eb_bh = NULL; 63 struct ocfs2_extent_map_item *emi;
163 u64 blkno;
164 u32 rec_end;
165 struct ocfs2_extent_block *eb;
166 struct ocfs2_extent_rec *rec;
167
168 /*
169 * The bh data containing the el cannot change here, because
170 * we hold alloc_sem. So we can do this without other
171 * locks.
172 */
173 while (el->l_tree_depth)
174 {
175 blkno = 0;
176 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
177 rec = &el->l_recs[i];
178 rec_end = (le32_to_cpu(rec->e_cpos) +
179 le32_to_cpu(rec->e_clusters));
180
181 ret = -EBADR;
182 if (rec_end > OCFS2_I(inode)->ip_clusters) {
183 mlog_errno(ret);
184 ocfs2_error(inode->i_sb,
185 "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
186 i,
187 (unsigned long long)le64_to_cpu(rec->e_blkno),
188 (unsigned long long)OCFS2_I(inode)->ip_blkno,
189 OCFS2_I(inode)->ip_clusters);
190 goto out_free;
191 }
192
193 if (rec_end <= cpos) {
194 ret = ocfs2_extent_map_insert(inode, rec,
195 le16_to_cpu(el->l_tree_depth));
196 if (ret && (ret != -EEXIST)) {
197 mlog_errno(ret);
198 goto out_free;
199 }
200 continue;
201 }
202 if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
203 ret = ocfs2_extent_map_insert(inode, rec,
204 le16_to_cpu(el->l_tree_depth));
205 if (ret && (ret != -EEXIST)) {
206 mlog_errno(ret);
207 goto out_free;
208 }
209 continue;
210 }
211 64
212 /* 65 *ret_emi = NULL;
213 * We've found a record that matches our
214 * interval. We don't insert it because we're
215 * about to traverse it.
216 */
217
218 /* Check to see if we're straddling */
219 ret = -ESRCH;
220 if (!ocfs2_extent_rec_contains_clusters(rec,
221 cpos,
222 clusters)) {
223 mlog_errno(ret);
224 goto out_free;
225 }
226 66
227 /* 67 list_for_each_entry(emi, &em->em_list, ei_list) {
228 * If we've already found a record, the el has 68 range = emi->ei_cpos + emi->ei_clusters;
229 * two records covering the same interval.
230 * EEEK!
231 */
232 ret = -EBADR;
233 if (blkno) {
234 mlog_errno(ret);
235 ocfs2_error(inode->i_sb,
236 "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n",
237 cpos, clusters,
238 (unsigned long long)OCFS2_I(inode)->ip_blkno,
239 (unsigned long long)blkno, i,
240 (unsigned long long)le64_to_cpu(rec->e_blkno));
241 goto out_free;
242 }
243 69
244 blkno = le64_to_cpu(rec->e_blkno); 70 if (cpos >= emi->ei_cpos && cpos < range) {
245 } 71 list_move(&emi->ei_list, &em->em_list);
246 72
247 /* 73 *ret_emi = emi;
248 * We don't support holes, and we're still up 74 break;
249 * in the branches, so we'd better have found someone
250 */
251 ret = -EBADR;
252 if (!blkno) {
253 ocfs2_error(inode->i_sb,
254 "No record found for (cpos = %u, clusters = %u) on inode %llu\n",
255 cpos, clusters,
256 (unsigned long long)OCFS2_I(inode)->ip_blkno);
257 mlog_errno(ret);
258 goto out_free;
259 }
260
261 if (eb_bh) {
262 brelse(eb_bh);
263 eb_bh = NULL;
264 }
265 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
266 blkno, &eb_bh, OCFS2_BH_CACHED,
267 inode);
268 if (ret) {
269 mlog_errno(ret);
270 goto out_free;
271 }
272 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
273 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
274 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
275 ret = -EIO;
276 goto out_free;
277 } 75 }
278 el = &eb->h_list;
279 } 76 }
77}
280 78
281 BUG_ON(el->l_tree_depth); 79static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
282 80 unsigned int *phys, unsigned int *len,
283 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 81 unsigned int *flags)
284 rec = &el->l_recs[i]; 82{
285 83 unsigned int coff;
286 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > 84 struct ocfs2_inode_info *oi = OCFS2_I(inode);
287 OCFS2_I(inode)->ip_clusters) { 85 struct ocfs2_extent_map_item *emi;
288 ret = -EBADR; 86
289 mlog_errno(ret); 87 spin_lock(&oi->ip_lock);
290 ocfs2_error(inode->i_sb, 88
291 "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", 89 __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi);
292 i, 90 if (emi) {
293 (unsigned long long)le64_to_cpu(rec->e_blkno), 91 coff = cpos - emi->ei_cpos;
294 (unsigned long long)OCFS2_I(inode)->ip_blkno, 92 *phys = emi->ei_phys + coff;
295 OCFS2_I(inode)->ip_clusters); 93 if (len)
296 return ret; 94 *len = emi->ei_clusters - coff;
297 } 95 if (flags)
298 96 *flags = emi->ei_flags;
299 ret = ocfs2_extent_map_insert(inode, rec,
300 le16_to_cpu(el->l_tree_depth));
301 if (ret && (ret != -EEXIST)) {
302 mlog_errno(ret);
303 goto out_free;
304 }
305 } 97 }
306 98
307 ret = 0; 99 spin_unlock(&oi->ip_lock);
308 100
309out_free: 101 if (emi == NULL)
310 if (eb_bh) 102 return -ENOENT;
311 brelse(eb_bh);
312 103
313 return ret; 104 return 0;
314} 105}
315 106
316/* 107/*
317 * This lookup actually will read from disk. It has one invariant: 108 * Forget about all clusters equal to or greater than cpos.
318 * It will never re-traverse blocks. This means that all inserts should
319 * be new regions or more granular regions (both allowed by insert).
320 */ 109 */
321static int ocfs2_extent_map_lookup_read(struct inode *inode, 110void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
322 u32 cpos,
323 u32 clusters,
324 struct ocfs2_extent_map_entry **ret_ent)
325{ 111{
326 int ret; 112 struct list_head *p, *n;
327 u64 blkno; 113 struct ocfs2_extent_map_item *emi;
328 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 114 struct ocfs2_inode_info *oi = OCFS2_I(inode);
329 struct ocfs2_extent_map_entry *ent; 115 struct ocfs2_extent_map *em = &oi->ip_extent_map;
330 struct buffer_head *bh = NULL; 116 LIST_HEAD(tmp_list);
331 struct ocfs2_extent_block *eb; 117 unsigned int range;
332 struct ocfs2_dinode *di; 118
333 struct ocfs2_extent_list *el; 119 spin_lock(&oi->ip_lock);
334 120 list_for_each_safe(p, n, &em->em_list) {
335 spin_lock(&OCFS2_I(inode)->ip_lock); 121 emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
336 ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); 122
337 if (ent) { 123 if (emi->ei_cpos >= cpos) {
338 if (!ent->e_tree_depth) { 124 /* Full truncate of this record. */
339 spin_unlock(&OCFS2_I(inode)->ip_lock); 125 list_move(&emi->ei_list, &tmp_list);
340 *ret_ent = ent; 126 BUG_ON(em->em_num_items == 0);
341 return 0; 127 em->em_num_items--;
342 } 128 continue;
343 blkno = le64_to_cpu(ent->e_rec.e_blkno);
344 spin_unlock(&OCFS2_I(inode)->ip_lock);
345
346 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
347 OCFS2_BH_CACHED, inode);
348 if (ret) {
349 mlog_errno(ret);
350 if (bh)
351 brelse(bh);
352 return ret;
353 } 129 }
354 eb = (struct ocfs2_extent_block *)bh->b_data;
355 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
356 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
357 brelse(bh);
358 return -EIO;
359 }
360 el = &eb->h_list;
361 } else {
362 spin_unlock(&OCFS2_I(inode)->ip_lock);
363 130
364 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 131 range = emi->ei_cpos + emi->ei_clusters;
365 OCFS2_I(inode)->ip_blkno, &bh, 132 if (range > cpos) {
366 OCFS2_BH_CACHED, inode); 133 /* Partial truncate */
367 if (ret) { 134 emi->ei_clusters = cpos - emi->ei_cpos;
368 mlog_errno(ret);
369 if (bh)
370 brelse(bh);
371 return ret;
372 } 135 }
373 di = (struct ocfs2_dinode *)bh->b_data;
374 if (!OCFS2_IS_VALID_DINODE(di)) {
375 brelse(bh);
376 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
377 return -EIO;
378 }
379 el = &di->id2.i_list;
380 }
381
382 ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
383 brelse(bh);
384 if (ret) {
385 mlog_errno(ret);
386 return ret;
387 } 136 }
137 spin_unlock(&oi->ip_lock);
388 138
389 ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); 139 list_for_each_safe(p, n, &tmp_list) {
390 if (!ent) { 140 emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
391 ret = -ESRCH; 141 list_del(&emi->ei_list);
392 mlog_errno(ret); 142 kfree(emi);
393 return ret;
394 } 143 }
395
396 /* FIXME: Make sure this isn't a corruption */
397 BUG_ON(ent->e_tree_depth);
398
399 *ret_ent = ent;
400
401 return 0;
402} 144}
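Truncation of the cache has exactly two cases, as the code above shows: an item whose start is at or past cpos is unlinked and freed, and an item straddling cpos is clipped so that it ends there. A standalone model with hypothetical values (items are clipped to zero rather than freed, to keep the sketch short):

    #include <stdio.h>

    struct emi { unsigned int cpos, clusters; };

    /* Nothing at or beyond "cpos" may survive. */
    static void trunc(struct emi *items, int n, unsigned int cpos)
    {
            int i;

            for (i = 0; i < n; i++) {
                    if (items[i].cpos >= cpos)
                            items[i].clusters = 0;  /* full truncate */
                    else if (items[i].cpos + items[i].clusters > cpos)
                            items[i].clusters = cpos - items[i].cpos; /* partial */
            }
    }

    int main(void)
    {
            struct emi items[] = { { 0, 10 }, { 10, 10 }, { 30, 5 } };
            int i;

            trunc(items, 3, 15);
            for (i = 0; i < 3; i++)
                    printf("[%u, +%u]\n", items[i].cpos, items[i].clusters);
            /* prints [0, +10], [10, +5], [30, +0] */
            return 0;
    }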
403 145
404/* 146/*
405 * Callers must hold ip_lock. This can insert pieces of the tree, 147 * Is any part of emi2 contained within emi1?
406 * thus racing lookup if the lock weren't held.
407 */ 148 */
408static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, 149static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1,
409 struct ocfs2_extent_map_entry *ent) 150 struct ocfs2_extent_map_item *emi2)
410{ 151{
411 struct rb_node **p, *parent; 152 unsigned int range1, range2;
412 struct ocfs2_extent_map_entry *old_ent;
413 153
414 old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), 154 /*
415 le32_to_cpu(ent->e_rec.e_clusters), 155 * Check if logical start of emi2 is inside emi1
416 &p, &parent); 156 */
417 if (old_ent) 157 range1 = emi1->ei_cpos + emi1->ei_clusters;
418 return -EEXIST; 158 if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1)
159 return 1;
419 160
420 rb_link_node(&ent->e_node, parent, p); 161 /*
421 rb_insert_color(&ent->e_node, &em->em_extents); 162 * Check if logical end of emi2 is inside emi1
163 */
164 range2 = emi2->ei_cpos + emi2->ei_clusters;
165 if (range2 > emi1->ei_cpos && range2 <= range1)
166 return 1;
422 167
423 return 0; 168 return 0;
424} 169}
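ocfs2_ei_is_contained() reports overlap when either endpoint of emi2 falls inside emi1's half-open range. A standalone model (hypothetical values):

    #include <stdio.h>

    struct emi { unsigned int cpos, clusters; };

    /* Does any part of "b" fall within "a"?  Ranges are half-open. */
    static int overlaps(const struct emi *a, const struct emi *b)
    {
            unsigned int a_end = a->cpos + a->clusters;
            unsigned int b_end = b->cpos + b->clusters;

            if (b->cpos >= a->cpos && b->cpos < a_end)
                    return 1;               /* b starts inside a */
            if (b_end > a->cpos && b_end <= a_end)
                    return 1;               /* b ends inside a */
            return 0;
    }

    int main(void)
    {
            struct emi a = { 10, 10 };      /* clusters [10, 20) */
            struct emi b = { 18, 5 };       /* clusters [18, 23) */

            printf("%d\n", overlaps(&a, &b));       /* 1: b starts inside a */
            return 0;
    }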
425 170
171static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest,
172 struct ocfs2_extent_map_item *src)
173{
174 dest->ei_cpos = src->ei_cpos;
175 dest->ei_phys = src->ei_phys;
176 dest->ei_clusters = src->ei_clusters;
177 dest->ei_flags = src->ei_flags;
178}
426 179
427/* 180/*
428 * Simple rule: on any return code other than -EAGAIN, anything left 181 * Try to merge emi with ins. Returns 1 if merge succeeds, zero
429 * in the insert_context will be freed. 182 * otherwise.
430 *
431 * Simple rule #2: A return code of -EEXIST from this function or
432 * its calls to ocfs2_extent_map_insert_entry() signifies that another
433 * thread beat us to the insert. It is not an actual error, but it
434 * tells the caller we have no more work to do.
435 */ 183 */
436static int ocfs2_extent_map_try_insert(struct inode *inode, 184static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
437 struct ocfs2_extent_rec *rec, 185 struct ocfs2_extent_map_item *ins)
438 int tree_depth,
439 struct ocfs2_em_insert_context *ctxt)
440{ 186{
441 int ret;
442 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
443 struct ocfs2_extent_map_entry *old_ent;
444
445 ctxt->need_left = 0;
446 ctxt->need_right = 0;
447 ctxt->old_ent = NULL;
448
449 spin_lock(&OCFS2_I(inode)->ip_lock);
450 ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
451 if (!ret) {
452 ctxt->new_ent = NULL;
453 goto out_unlock;
454 }
455
456 /* Since insert_entry failed, the map MUST have old_ent */
457 old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
458 le32_to_cpu(rec->e_clusters),
459 NULL, NULL);
460
461 BUG_ON(!old_ent);
462
463 if (old_ent->e_tree_depth < tree_depth) {
464 /* Another thread beat us to the lower tree_depth */
465 ret = -EEXIST;
466 goto out_unlock;
467 }
468
469 if (old_ent->e_tree_depth == tree_depth) {
470 /*
471 * Another thread beat us to this tree_depth.
472 * Let's make sure we agree with that thread (the
473 * extent_rec should be identical).
474 */
475 if (!memcmp(rec, &old_ent->e_rec,
476 sizeof(struct ocfs2_extent_rec)))
477 ret = 0;
478 else
479 /* FIXME: Should this be ESRCH/EBADR??? */
480 ret = -EEXIST;
481
482 goto out_unlock;
483 }
484
485 /* 187 /*
486 * We do it in this order specifically so that no actual tree 188 * Handle contiguousness
487 * changes occur until we have all the pieces we need. We
488 * don't want malloc failures to leave an inconsistent tree.
489 * Whenever we drop the lock, another process could be
490 * inserting. Also note that, if another process just beat us
491 * to an insert, we might not need the same pieces we needed
492 * the first go round. In the end, the pieces we need will
493 * be used, and the pieces we don't will be freed.
494 */ 189 */
495 ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > 190 if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) &&
496 le32_to_cpu(old_ent->e_rec.e_cpos)); 191 ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) &&
497 ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + 192 ins->ei_flags == emi->ei_flags) {
498 le32_to_cpu(old_ent->e_rec.e_clusters)) > 193 emi->ei_clusters += ins->ei_clusters;
499 (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))); 194 return 1;
500 ret = -EAGAIN; 195 } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
501 if (ctxt->need_left) { 196 (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos &&
502 if (!ctxt->left_ent) 197 ins->ei_flags == emi->ei_flags) {
503 goto out_unlock; 198 emi->ei_phys = ins->ei_phys;
504 *(ctxt->left_ent) = *old_ent; 199 emi->ei_cpos = ins->ei_cpos;
505 ctxt->left_ent->e_rec.e_clusters = 200 emi->ei_clusters += ins->ei_clusters;
506 cpu_to_le32(le32_to_cpu(rec->e_cpos) - 201 return 1;
507 le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
508 }
509 if (ctxt->need_right) {
510 if (!ctxt->right_ent)
511 goto out_unlock;
512 *(ctxt->right_ent) = *old_ent;
513 ctxt->right_ent->e_rec.e_cpos =
514 cpu_to_le32(le32_to_cpu(rec->e_cpos) +
515 le32_to_cpu(rec->e_clusters));
516 ctxt->right_ent->e_rec.e_clusters =
517 cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
518 le32_to_cpu(old_ent->e_rec.e_clusters)) -
519 le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
520 }
521
522 rb_erase(&old_ent->e_node, &em->em_extents);
523 /* Now that he's erased, set him up for deletion */
524 ctxt->old_ent = old_ent;
525
526 if (ctxt->need_left) {
527 ret = ocfs2_extent_map_insert_entry(em,
528 ctxt->left_ent);
529 if (ret)
530 goto out_unlock;
531 ctxt->left_ent = NULL;
532 } 202 }
533 203
534 if (ctxt->need_right) { 204 /*
535 ret = ocfs2_extent_map_insert_entry(em, 205 * Overlapping extents - this shouldn't happen unless we've
536 ctxt->right_ent); 206 * split an extent to change it's flags. That is exceedingly
537 if (ret) 207 * rare, so there's no sense in trying to optimize it yet.
538 goto out_unlock; 208 */
539 ctxt->right_ent = NULL; 209 if (ocfs2_ei_is_contained(emi, ins) ||
210 ocfs2_ei_is_contained(ins, emi)) {
211 ocfs2_copy_emi_fields(emi, ins);
212 return 1;
540 } 213 }
541 214
542 ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); 215 /* No merge was possible. */
543 216 return 0;
544 if (!ret)
545 ctxt->new_ent = NULL;
546
547out_unlock:
548 spin_unlock(&OCFS2_I(inode)->ip_lock);
549
550 return ret;
551} 217}
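The contiguity tests above are pure arithmetic: an appended extent must continue both the logical and the physical run and carry the same flags; the prepend case is the mirror image. A standalone worked example of the append case (hypothetical values):

    #include <stdio.h>

    struct emi { unsigned int cpos, phys, clusters, flags; };

    /* Append-merge "ins" into "e" when it continues both runs. */
    static int try_append(struct emi *e, const struct emi *ins)
    {
            if (ins->phys == e->phys + e->clusters &&
                ins->cpos == e->cpos + e->clusters &&
                ins->flags == e->flags) {
                    e->clusters += ins->clusters;
                    return 1;
            }
            return 0;
    }

    int main(void)
    {
            struct emi e   = { 100, 500, 8, 0 };    /* [100, +8) -> 500 */
            struct emi ins = { 108, 508, 4, 0 };    /* [108, +4) -> 508 */

            if (try_append(&e, &ins))
                    printf("merged: [%u, +%u] -> phys %u\n",
                           e.cpos, e.clusters, e.phys);  /* [100, +12] -> 500 */
            return 0;
    }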
552 218
553 219/*
554static int ocfs2_extent_map_insert(struct inode *inode, 220 * In order to reduce complexity on the caller, this insert function
555 struct ocfs2_extent_rec *rec, 221 * is intentionally liberal in what it will accept.
556 int tree_depth) 222 *
223 * The only rule is that the truncate call *must* be used whenever
224 * records have been deleted. This avoids inserting overlapping
225 * records with different physical mappings.
226 */
227void ocfs2_extent_map_insert_rec(struct inode *inode,
228 struct ocfs2_extent_rec *rec)
557{ 229{
558 int ret; 230 struct ocfs2_inode_info *oi = OCFS2_I(inode);
559 struct ocfs2_em_insert_context ctxt = {0, }; 231 struct ocfs2_extent_map *em = &oi->ip_extent_map;
560 232 struct ocfs2_extent_map_item *emi, *new_emi = NULL;
561 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > 233 struct ocfs2_extent_map_item ins;
562 OCFS2_I(inode)->ip_map.em_clusters) { 234
563 ret = -EBADR; 235 ins.ei_cpos = le32_to_cpu(rec->e_cpos);
564 mlog_errno(ret); 236 ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb,
565 return ret; 237 le64_to_cpu(rec->e_blkno));
238 ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters);
239 ins.ei_flags = rec->e_flags;
240
241search:
242 spin_lock(&oi->ip_lock);
243
244 list_for_each_entry(emi, &em->em_list, ei_list) {
245 if (ocfs2_try_to_merge_extent_map(emi, &ins)) {
246 list_move(&emi->ei_list, &em->em_list);
247 spin_unlock(&oi->ip_lock);
248 goto out;
249 }
566 } 250 }
567 251
568 /* Zero e_clusters means a truncated tail record. It better be EOF */ 252 /*
569 if (!rec->e_clusters) { 253 * No item could be merged.
570 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != 254 *
571 OCFS2_I(inode)->ip_map.em_clusters) { 255 * Either allocate and add a new item, or overwrite the least recently
572 ret = -EBADR; 256 * inserted.
573 mlog_errno(ret); 257 */
574 ocfs2_error(inode->i_sb,
575 "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n",
576 (unsigned long long)le64_to_cpu(rec->e_blkno),
577 (unsigned long long)OCFS2_I(inode)->ip_blkno);
578 return ret;
579 }
580 258
581 /* Ignore the truncated tail */ 259 if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) {
582 return 0; 260 if (new_emi == NULL) {
583 } 261 spin_unlock(&oi->ip_lock);
584 262
585 ret = -ENOMEM; 263 new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS);
586 ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, 264 if (new_emi == NULL)
587 GFP_NOFS); 265 goto out;
588 if (!ctxt.new_ent) {
589 mlog_errno(ret);
590 return ret;
591 }
592 266
593 ctxt.new_ent->e_rec = *rec; 267 goto search;
594 ctxt.new_ent->e_tree_depth = tree_depth;
595
596 do {
597 ret = -ENOMEM;
598 if (ctxt.need_left && !ctxt.left_ent) {
599 ctxt.left_ent =
600 kmem_cache_alloc(ocfs2_em_ent_cachep,
601 GFP_NOFS);
602 if (!ctxt.left_ent)
603 break;
604 }
605 if (ctxt.need_right && !ctxt.right_ent) {
606 ctxt.right_ent =
607 kmem_cache_alloc(ocfs2_em_ent_cachep,
608 GFP_NOFS);
609 if (!ctxt.right_ent)
610 break;
611 } 268 }
612 269
613 ret = ocfs2_extent_map_try_insert(inode, rec, 270 ocfs2_copy_emi_fields(new_emi, &ins);
614 tree_depth, &ctxt); 271 list_add(&new_emi->ei_list, &em->em_list);
615 } while (ret == -EAGAIN); 272 em->em_num_items++;
616 273 new_emi = NULL;
617 if ((ret < 0) && (ret != -EEXIST)) 274 } else {
618 mlog_errno(ret); 275 BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0);
276 emi = list_entry(em->em_list.prev,
277 struct ocfs2_extent_map_item, ei_list);
278 list_move(&emi->ei_list, &em->em_list);
279 ocfs2_copy_emi_fields(emi, &ins);
280 }
619 281
620 if (ctxt.left_ent) 282 spin_unlock(&oi->ip_lock);
621 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
622 if (ctxt.right_ent)
623 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
624 if (ctxt.old_ent)
625 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
626 if (ctxt.new_ent)
627 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
628 283
629 return ret; 284out:
285 if (new_emi)
286 kfree(new_emi);
630} 287}
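With OCFS2_MAX_EXTENT_MAP_ITEMS fixed at three, the insert path above degenerates into a tiny LRU: lookup hits and fresh inserts move to the front of the list, and once the cache is full the tail, the least recently used slot, is overwritten in place. A standalone model of that replacement policy (an array stands in for the kernel list):

    #include <stdio.h>
    #include <string.h>

    #define MAX_ITEMS 3

    /* Front of the array is most recently used; a full cache
     * overwrites the tail slot and moves it to the front. */
    static void insert(int cache[], int *n, int val)
    {
            if (*n < MAX_ITEMS)
                    (*n)++;
            memmove(&cache[1], &cache[0], (*n - 1) * sizeof(int));
            cache[0] = val;
    }

    int main(void)
    {
            int cache[MAX_ITEMS], n = 0, i;

            insert(cache, &n, 1);
            insert(cache, &n, 2);
            insert(cache, &n, 3);
            insert(cache, &n, 4);   /* evicts 1, the LRU entry */
            for (i = 0; i < n; i++)
                    printf("%d ", cache[i]);        /* 4 3 2 */
            printf("\n");
            return 0;
    }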
631 288
632/* 289/*
633 * Append this record to the tail of the extent map. It must be 290 * Return the 1st index within el which contains an extent start
634 * tree_depth 0. The record might be an extension of an existing 291 * larger than v_cluster.
635 * record, and as such that needs to be handled. eg:
636 *
637 * Existing record in the extent map:
638 *
639 * cpos = 10, len = 10
640 * |---------|
641 *
642 * New Record:
643 *
644 * cpos = 10, len = 20
645 * |------------------|
646 *
647 * The passed record is the new on-disk record. The new_clusters value
648 * is how many clusters were added to the file. If the append is a
649 * contiguous append, the new_clusters has been added to
650 * rec->e_clusters. If the append is an entirely new extent, then
651 * rec->e_clusters is == new_clusters.
652 */ 292 */
653int ocfs2_extent_map_append(struct inode *inode, 293static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
654 struct ocfs2_extent_rec *rec, 294 u32 v_cluster)
655 u32 new_clusters)
656{ 295{
657 int ret; 296 int i;
658 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 297 struct ocfs2_extent_rec *rec;
659 struct ocfs2_extent_map_entry *ent;
660 struct ocfs2_extent_rec *old;
661
662 BUG_ON(!new_clusters);
663 BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
664 298
665 if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { 299 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
666 /* 300 rec = &el->l_recs[i];
667 * Size changed underneath us on disk. Drop any
668 * straddling records and update our idea of
669 * i_clusters
670 */
671 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
672 em->em_clusters = OCFS2_I(inode)->ip_clusters;
673 }
674 301
675 mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + 302 if (v_cluster < le32_to_cpu(rec->e_cpos))
676 le32_to_cpu(rec->e_clusters)) != 303 break;
677 (em->em_clusters + new_clusters),
678 "Inode %llu:\n"
679 "rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
680 "em->em_clusters = %u + new_clusters = %u = %u\n",
681 (unsigned long long)OCFS2_I(inode)->ip_blkno,
682 le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
683 le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
684 em->em_clusters, new_clusters,
685 em->em_clusters + new_clusters);
686
687 em->em_clusters += new_clusters;
688
689 ret = -ENOENT;
690 if (le32_to_cpu(rec->e_clusters) > new_clusters) {
691 /* This is a contiguous append */
692 ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
693 NULL, NULL);
694 if (ent) {
695 old = &ent->e_rec;
696 BUG_ON((le32_to_cpu(rec->e_cpos) +
697 le32_to_cpu(rec->e_clusters)) !=
698 (le32_to_cpu(old->e_cpos) +
699 le32_to_cpu(old->e_clusters) +
700 new_clusters));
701 if (ent->e_tree_depth == 0) {
702 BUG_ON(le32_to_cpu(old->e_cpos) !=
703 le32_to_cpu(rec->e_cpos));
704 BUG_ON(le64_to_cpu(old->e_blkno) !=
705 le64_to_cpu(rec->e_blkno));
706 ret = 0;
707 }
708 /*
709 * Let non-leafs fall through as -ENOENT to
710 * force insertion of the new leaf.
711 */
712 le32_add_cpu(&old->e_clusters, new_clusters);
713 }
714 } 304 }
715 305
716 if (ret == -ENOENT) 306 return i;
717 ret = ocfs2_extent_map_insert(inode, rec, 0);
718 if (ret < 0)
719 mlog_errno(ret);
720 return ret;
721} 307}
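ocfs2_search_for_hole_index() leans on l_recs being sorted by e_cpos: the first record starting beyond v_cluster bounds the hole, and if no such record exists the hole runs to the maximum cluster count. A standalone model of the simple case (hypothetical record list; the next-leaf traversal done by ocfs2_figure_hole_clusters() is omitted):

    #include <stdio.h>
    #include <limits.h>

    struct rec { unsigned int cpos, clusters; };

    /* Size of the hole at "v", given records sorted by start cluster. */
    static unsigned int hole_clusters(const struct rec *recs, int n,
                                      unsigned int v)
    {
            int i;

            for (i = 0; i < n; i++)
                    if (v < recs[i].cpos)
                            return recs[i].cpos - v;
            return UINT_MAX - v;    /* no allocation past v */
    }

    int main(void)
    {
            struct rec recs[] = { { 0, 4 }, { 10, 6 } };

            printf("%u\n", hole_clusters(recs, 2, 5));      /* 5: hole is [5, 10) */
            return 0;
    }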
722 308
723#if 0
724/* Code here is included but defined out as it completes the extent
725 * map api and may be used in the future. */
726
727/* 309/*
728 * Look up the record containing this cluster offset. This record is 310 * Figure out the size of a hole which starts at v_cluster within the given
729 * part of the extent map. Do not free it. Any changes you make to 311 * extent list.
730 * it will reflect in the extent map. So, if your last extent
731 * is (cpos = 10, clusters = 10) and you truncate the file by 5
732 * clusters, you can do:
733 * 312 *
734 * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); 313 * If there is no more allocation past v_cluster, we return the maximum
735 * rec->e_clusters -= 5; 314 * cluster size minus v_cluster.
736 * 315 *
737 * The lookup does not read from disk. If the map isn't filled in for 316 * If we have in-inode extents, then el points to the dinode list and
738 * an entry, you won't find it. 317 * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
739 * 318 * containing el.
740 * Also note that the returned record is valid until alloc_sem is
741 * dropped. After that, truncate and extend can happen. Caveat Emptor.
742 */ 319 */
743int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, 320static int ocfs2_figure_hole_clusters(struct inode *inode,
744 struct ocfs2_extent_rec **rec, 321 struct ocfs2_extent_list *el,
745 int *tree_depth) 322 struct buffer_head *eb_bh,
323 u32 v_cluster,
324 u32 *num_clusters)
746{ 325{
747 int ret = -ENOENT; 326 int ret, i;
748 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 327 struct buffer_head *next_eb_bh = NULL;
749 struct ocfs2_extent_map_entry *ent; 328 struct ocfs2_extent_block *eb, *next_eb;
750 329
751 *rec = NULL; 330 i = ocfs2_search_for_hole_index(el, v_cluster);
752 331
753 if (cpos >= OCFS2_I(inode)->ip_clusters) 332 if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) {
754 return -EINVAL; 333 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
755 334
756 if (cpos >= em->em_clusters) {
757 /* 335 /*
758 * Size changed underneath us on disk. Drop any 336 * Check the next leaf for any extents.
759 * straddling records and update our idea of
760 * i_clusters
761 */ 337 */
762 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
763 em->em_clusters = OCFS2_I(inode)->ip_clusters ;
764 }
765
766 ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
767 NULL, NULL);
768 338
769 if (ent) { 339 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
770 *rec = &ent->e_rec; 340 goto no_more_extents;
771 if (tree_depth)
772 *tree_depth = ent->e_tree_depth;
773 ret = 0;
774 }
775 341
776 return ret; 342 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
777} 343 le64_to_cpu(eb->h_next_leaf_blk),
344 &next_eb_bh, OCFS2_BH_CACHED, inode);
345 if (ret) {
346 mlog_errno(ret);
347 goto out;
348 }
349 next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
778 350
779int ocfs2_extent_map_get_clusters(struct inode *inode, 351 if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
780 u32 v_cpos, int count, 352 ret = -EROFS;
781 u32 *p_cpos, int *ret_count) 353 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
782{ 354 goto out;
783 int ret; 355 }
784 u32 coff, ccount;
785 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
786 struct ocfs2_extent_map_entry *ent = NULL;
787 356
788 *p_cpos = ccount = 0; 357 el = &next_eb->h_list;
789 358
790 if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) 359 i = ocfs2_search_for_hole_index(el, v_cluster);
791 return -EINVAL; 360 }
792 361
793 if ((v_cpos + count) > em->em_clusters) { 362no_more_extents:
363 if (i == le16_to_cpu(el->l_next_free_rec)) {
794 /* 364 /*
795 * Size changed underneath us on disk. Drop any 365 * We're at the end of our existing allocation. Just
796 * straddling records and update our idea of 366 * return the maximum number of clusters we could
797 * i_clusters 367 * possibly allocate.
798 */ 368 */
799 ocfs2_extent_map_drop(inode, em->em_clusters - 1); 369 *num_clusters = UINT_MAX - v_cluster;
800 em->em_clusters = OCFS2_I(inode)->ip_clusters; 370 } else {
371 *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster;
801 } 372 }
802 373
374 ret = 0;
375out:
376 brelse(next_eb_bh);
377 return ret;
378}
803 379
804 ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); 380/*
805 if (ret) 381 * Return the index of the extent record which contains cluster #v_cluster.
806 return ret; 382 * -1 is returned if it was not found.
383 *
384 * Should work fine on interior and exterior nodes.
385 */
386static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
387 u32 v_cluster)
388{
389 int ret = -1;
390 int i;
391 struct ocfs2_extent_rec *rec;
392 u32 rec_end, rec_start, clusters;
807 393
808 if (ent) { 394 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
809 /* We should never find ourselves straddling an interval */ 395 rec = &el->l_recs[i];
810 if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
811 v_cpos,
812 count))
813 return -ESRCH;
814 396
815 coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); 397 rec_start = le32_to_cpu(rec->e_cpos);
816 *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, 398 clusters = ocfs2_rec_clusters(el, rec);
817 le64_to_cpu(ent->e_rec.e_blkno)) +
818 coff;
819 399
820 if (ret_count) 400 rec_end = rec_start + clusters;
821 *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
822 401
823 return 0; 402 if (v_cluster >= rec_start && v_cluster < rec_end) {
403 ret = i;
404 break;
405 }
824 } 406 }
825 407
826 408 return ret;
827 return -ENOENT;
828} 409}
829 410
830#endif /* 0 */ 411int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
831 412 u32 *p_cluster, u32 *num_clusters,
832int ocfs2_extent_map_get_blocks(struct inode *inode, 413 unsigned int *extent_flags)
833 u64 v_blkno, int count,
834 u64 *p_blkno, int *ret_count)
835{ 414{
836 int ret; 415 int ret, i;
837 u64 boff; 416 unsigned int flags = 0;
838 u32 cpos, clusters; 417 struct buffer_head *di_bh = NULL;
839 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 418 struct buffer_head *eb_bh = NULL;
840 struct ocfs2_extent_map_entry *ent = NULL; 419 struct ocfs2_dinode *di;
841 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 420 struct ocfs2_extent_block *eb;
421 struct ocfs2_extent_list *el;
842 struct ocfs2_extent_rec *rec; 422 struct ocfs2_extent_rec *rec;
423 u32 coff;
843 424
844 *p_blkno = 0; 425 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
845 426 num_clusters, extent_flags);
846 cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); 427 if (ret == 0)
847 clusters = ocfs2_blocks_to_clusters(inode->i_sb, 428 goto out;
848 (u64)count + bpc - 1);
849 if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
850 ret = -EINVAL;
851 mlog_errno(ret);
852 return ret;
853 }
854
855 if ((cpos + clusters) > em->em_clusters) {
856 /*
857 * Size changed underneath us on disk. Drop any
858 * straddling records and update our idea of
859 * i_clusters
860 */
861 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
862 em->em_clusters = OCFS2_I(inode)->ip_clusters;
863 }
864 429
865 ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); 430 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
431 &di_bh, OCFS2_BH_CACHED, inode);
866 if (ret) { 432 if (ret) {
867 mlog_errno(ret); 433 mlog_errno(ret);
868 return ret; 434 goto out;
869 } 435 }
870 436
871 if (ent) 437 di = (struct ocfs2_dinode *) di_bh->b_data;
872 { 438 el = &di->id2.i_list;
873 rec = &ent->e_rec;
874 439
875 /* We should never find ourselves straddling an interval */ 440 if (el->l_tree_depth) {
876 if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { 441 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
877 ret = -ESRCH; 442 if (ret) {
878 mlog_errno(ret); 443 mlog_errno(ret);
879 return ret; 444 goto out;
880 } 445 }
881 446
882 boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - 447 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
883 le32_to_cpu(rec->e_cpos)); 448 el = &eb->h_list;
884 boff += (v_blkno & (u64)(bpc - 1));
885 *p_blkno = le64_to_cpu(rec->e_blkno) + boff;
886 449
887 if (ret_count) { 450 if (el->l_tree_depth) {
888 *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, 451 ocfs2_error(inode->i_sb,
889 le32_to_cpu(rec->e_clusters)) - boff; 452 "Inode %lu has non zero tree depth in "
453 "leaf block %llu\n", inode->i_ino,
454 (unsigned long long)eb_bh->b_blocknr);
455 ret = -EROFS;
456 goto out;
890 } 457 }
891
892 return 0;
893 } 458 }
894 459
895 return -ENOENT; 460 i = ocfs2_search_extent_list(el, v_cluster);
896} 461 if (i == -1) {
897 462 /*
898int ocfs2_extent_map_init(struct inode *inode) 463 * A hole was found. Return some canned values that
899{ 464 * callers can key on. If asked for, num_clusters will
900 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 465 * be populated with the size of the hole.
901 466 */
902 em->em_extents = RB_ROOT; 467 *p_cluster = 0;
903 em->em_clusters = 0; 468 if (num_clusters) {
904 469 ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
905 return 0; 470 v_cluster,
906} 471 num_clusters);
907 472 if (ret) {
908/* Needs the lock */ 473 mlog_errno(ret);
909static void __ocfs2_extent_map_drop(struct inode *inode, 474 goto out;
910 u32 new_clusters, 475 }
911 struct rb_node **free_head, 476 }
912 struct ocfs2_extent_map_entry **tail_ent) 477 } else {
913{ 478 rec = &el->l_recs[i];
914 struct rb_node *node, *next;
915 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
916 struct ocfs2_extent_map_entry *ent;
917 479
918 *free_head = NULL; 480 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
919 481
920 ent = NULL; 482 if (!rec->e_blkno) {
921 node = rb_last(&em->em_extents); 483 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
922 while (node) 484 "record (%u, %u, 0)", inode->i_ino,
923 { 485 le32_to_cpu(rec->e_cpos),
924 next = rb_prev(node); 486 ocfs2_rec_clusters(el, rec));
487 ret = -EROFS;
488 goto out;
489 }
925 490
926 ent = rb_entry(node, struct ocfs2_extent_map_entry, 491 coff = v_cluster - le32_to_cpu(rec->e_cpos);
927 e_node);
928 if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
929 break;
930 492
931 rb_erase(&ent->e_node, &em->em_extents); 493 *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
494 le64_to_cpu(rec->e_blkno));
495 *p_cluster = *p_cluster + coff;
932 496
933 node->rb_right = *free_head; 497 if (num_clusters)
934 *free_head = node; 498 *num_clusters = ocfs2_rec_clusters(el, rec) - coff;
935 499
936 ent = NULL; 500 flags = rec->e_flags;
937 node = next;
938 }
939 501
940 /* Do we have an entry straddling new_clusters? */ 502 ocfs2_extent_map_insert_rec(inode, rec);
941 if (tail_ent) {
942 if (ent &&
943 ((le32_to_cpu(ent->e_rec.e_cpos) +
944 le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
945 *tail_ent = ent;
946 else
947 *tail_ent = NULL;
948 } 503 }
949}
950
951static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
952{
953 struct rb_node *node;
954 struct ocfs2_extent_map_entry *ent;
955 504
956 while (free_head) { 505 if (extent_flags)
957 node = free_head; 506 *extent_flags = flags;
958 free_head = node->rb_right;
959 507
960 ent = rb_entry(node, struct ocfs2_extent_map_entry, 508out:
961 e_node); 509 brelse(di_bh);
962 kmem_cache_free(ocfs2_em_ent_cachep, ent); 510 brelse(eb_bh);
963 } 511 return ret;
964} 512}
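As the hole comment above says, ocfs2_get_clusters() hands back canned values for unallocated space: *p_cluster == 0, with num_clusters optionally set to the hole size. A hypothetical caller sketch (handle_hole() and handle_mapped() are illustrative placeholders, not ocfs2 functions):

    u32 p_cluster, num_clusters;
    int ret;

    ret = ocfs2_get_clusters(inode, v_cluster, &p_cluster,
                             &num_clusters, NULL);
    if (ret)
            mlog_errno(ret);
    else if (p_cluster == 0)
            handle_hole(v_cluster, num_clusters);   /* no allocation here */
    else
            handle_mapped(p_cluster, num_clusters); /* [p_cluster, +num_clusters) */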
965 513
966/* 514/*
967 * Remove all entries past new_clusters, inclusive of an entry that 515 * This expects alloc_sem to be held. The allocation cannot change at
968 * contains new_clusters. This is effectively a cache forget. 516 * all while the map is in the process of being updated.
969 *
970 * If you want to also clip the last extent by some number of clusters,
971 * you need to call ocfs2_extent_map_trunc().
972 * This code does not check or modify ip_clusters.
973 */ 517 */
974int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) 518int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
519 u64 *ret_count, unsigned int *extent_flags)
975{ 520{
976 struct rb_node *free_head = NULL; 521 int ret;
977 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 522 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
978 struct ocfs2_extent_map_entry *ent; 523 u32 cpos, num_clusters, p_cluster;
979 524 u64 boff = 0;
980 spin_lock(&OCFS2_I(inode)->ip_lock);
981 525
982 __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); 526 cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
983 527
984 if (ent) { 528 ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters,
985 rb_erase(&ent->e_node, &em->em_extents); 529 extent_flags);
986 ent->e_node.rb_right = free_head; 530 if (ret) {
987 free_head = &ent->e_node; 531 mlog_errno(ret);
532 goto out;
988 } 533 }
989 534
990 spin_unlock(&OCFS2_I(inode)->ip_lock); 535 /*
991 536 * p_cluster == 0 indicates a hole.
992 if (free_head) 537 */
993 __ocfs2_extent_map_drop_cleanup(free_head); 538 if (p_cluster) {
994 539 boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
995 return 0; 540 boff += (v_blkno & (u64)(bpc - 1));
996} 541 }
997
998/*
999 * Remove all entries past new_clusters and also clip any extent
1000 * straddling new_clusters, if there is one. This does not check
1001 * or modify ip_clusters
1002 */
1003int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
1004{
1005 struct rb_node *free_head = NULL;
1006 struct ocfs2_extent_map_entry *ent = NULL;
1007
1008 spin_lock(&OCFS2_I(inode)->ip_lock);
1009
1010 __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
1011
1012 if (ent)
1013 ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
1014 le32_to_cpu(ent->e_rec.e_cpos));
1015
1016 OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
1017
1018 spin_unlock(&OCFS2_I(inode)->ip_lock);
1019
1020 if (free_head)
1021 __ocfs2_extent_map_drop_cleanup(free_head);
1022
1023 return 0;
1024}
1025 542
1026int __init init_ocfs2_extent_maps(void) 543 *p_blkno = boff;
1027{
1028 ocfs2_em_ent_cachep =
1029 kmem_cache_create("ocfs2_em_ent",
1030 sizeof(struct ocfs2_extent_map_entry),
1031 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
1032 if (!ocfs2_em_ent_cachep)
1033 return -ENOMEM;
1034 544
1035 return 0; 545 if (ret_count) {
1036} 546 *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
547 *ret_count -= v_blkno & (u64)(bpc - 1);
548 }
1037 549
1038void exit_ocfs2_extent_maps(void) 550out:
1039{ 551 return ret;
1040 kmem_cache_destroy(ocfs2_em_ent_cachep);
1041} 552}
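ocfs2_extent_map_get_blocks() is now a thin wrapper that turns a block question into a cluster question and back. With bpc blocks per cluster (always a power of two, which is why the mask works), the interesting step is re-applying the sub-cluster offset v_blkno & (bpc - 1) after the cluster lookup. A standalone model with hypothetical geometry:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long bpc = 8, v_blkno = 27;   /* 8 blocks/cluster */
            unsigned int p_cluster = 500;   /* pretend lookup: cluster 3 -> 500 */
            unsigned long long p_blkno;

            /* cpos = v_blkno / bpc = 3, which the lookup mapped to 500 */
            p_blkno = (unsigned long long)p_cluster * bpc;  /* first block */
            p_blkno += v_blkno & (bpc - 1);                 /* 27 % 8 = 3  */
            printf("p_blkno = %llu\n", p_blkno);            /* 4003 */
            return 0;
    }

The ret_count computation is the complement: the run's size in blocks minus that same sub-cluster offset.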
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index fa3745efa886..de91e3e41a22 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -25,22 +25,29 @@
25#ifndef _EXTENT_MAP_H 25#ifndef _EXTENT_MAP_H
26#define _EXTENT_MAP_H 26#define _EXTENT_MAP_H
27 27
28int init_ocfs2_extent_maps(void); 28struct ocfs2_extent_map_item {
29void exit_ocfs2_extent_maps(void); 29 unsigned int ei_cpos;
30 unsigned int ei_phys;
31 unsigned int ei_clusters;
32 unsigned int ei_flags;
30 33
31/* 34 struct list_head ei_list;
32 * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem 35};
33 * to be held. The allocation cannot change at all while the map is 36
34 * in the process of being updated. 37#define OCFS2_MAX_EXTENT_MAP_ITEMS 3
35 */ 38struct ocfs2_extent_map {
36int ocfs2_extent_map_init(struct inode *inode); 39 unsigned int em_num_items;
37int ocfs2_extent_map_append(struct inode *inode, 40 struct list_head em_list;
38 struct ocfs2_extent_rec *rec, 41};
39 u32 new_clusters); 42
40int ocfs2_extent_map_get_blocks(struct inode *inode, 43void ocfs2_extent_map_init(struct inode *inode);
41 u64 v_blkno, int count, 44void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster);
42 u64 *p_blkno, int *ret_count); 45void ocfs2_extent_map_insert_rec(struct inode *inode,
43int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters); 46 struct ocfs2_extent_rec *rec);
44int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters); 47
48int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
49 u32 *num_clusters, unsigned int *extent_flags);
50int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
51 u64 *ret_count, unsigned int *extent_flags);
45 52
46#endif /* _EXTENT_MAP_H */ 53#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f2cd3bf9efb2..520a2a6d7670 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -33,6 +33,7 @@
33#include <linux/sched.h> 33#include <linux/sched.h>
34#include <linux/pipe_fs_i.h> 34#include <linux/pipe_fs_i.h>
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/writeback.h>
36 37
37#define MLOG_MASK_PREFIX ML_INODE 38#define MLOG_MASK_PREFIX ML_INODE
38#include <cluster/masklog.h> 39#include <cluster/masklog.h>
@@ -215,7 +216,7 @@ int ocfs2_set_inode_size(handle_t *handle,
215 216
216 mlog_entry_void(); 217 mlog_entry_void();
217 i_size_write(inode, new_i_size); 218 i_size_write(inode, new_i_size);
218 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); 219 inode->i_blocks = ocfs2_inode_sector_count(inode);
219 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 220 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
220 221
221 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 222 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
@@ -261,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
261{ 262{
262 int status; 263 int status;
263 handle_t *handle; 264 handle_t *handle;
265 struct ocfs2_dinode *di;
264 266
265 mlog_entry_void(); 267 mlog_entry_void();
266 268
@@ -274,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
274 goto out; 276 goto out;
275 } 277 }
276 278
277 status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); 279 status = ocfs2_journal_access(handle, inode, fe_bh,
280 OCFS2_JOURNAL_ACCESS_WRITE);
281 if (status < 0) {
282 mlog_errno(status);
283 goto out_commit;
284 }
285
286 /*
287 * Do this before setting i_size.
288 */
289 status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
290 if (status) {
291 mlog_errno(status);
292 goto out_commit;
293 }
294
295 i_size_write(inode, new_i_size);
296 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
297 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
298
299 di = (struct ocfs2_dinode *) fe_bh->b_data;
300 di->i_size = cpu_to_le64(new_i_size);
301 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
302 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
303
304 status = ocfs2_journal_dirty(handle, fe_bh);
278 if (status < 0) 305 if (status < 0)
279 mlog_errno(status); 306 mlog_errno(status);
280 307
308out_commit:
281 ocfs2_commit_trans(osb, handle); 309 ocfs2_commit_trans(osb, handle);
282out: 310out:
311
283 mlog_exit(status); 312 mlog_exit(status);
284 return status; 313 return status;
285} 314}
@@ -342,19 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode,
342 mlog_errno(status); 371 mlog_errno(status);
343 goto bail; 372 goto bail;
344 } 373 }
345 ocfs2_data_unlock(inode, 1);
346
347 if (le32_to_cpu(fe->i_clusters) ==
348 ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
349 mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
350 fe->i_clusters);
351 /* No allocation change is required, so lets fast path
352 * this truncate. */
353 status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
354 if (status < 0)
355 mlog_errno(status);
356 goto bail;
357 }
358 374
359 /* alright, we're going to need to do a full blown alloc size 375 /* alright, we're going to need to do a full blown alloc size
360 * change. Orphan the inode so that recovery can complete the 376 * change. Orphan the inode so that recovery can complete the
@@ -363,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode,
363 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 379 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
364 if (status < 0) { 380 if (status < 0) {
365 mlog_errno(status); 381 mlog_errno(status);
366 goto bail; 382 goto bail_unlock_data;
367 } 383 }
368 384
369 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 385 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
370 if (status < 0) { 386 if (status < 0) {
371 mlog_errno(status); 387 mlog_errno(status);
372 goto bail; 388 goto bail_unlock_data;
373 } 389 }
374 390
375 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 391 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
376 if (status < 0) { 392 if (status < 0) {
377 mlog_errno(status); 393 mlog_errno(status);
378 goto bail; 394 goto bail_unlock_data;
379 } 395 }
380 396
381 /* TODO: orphan dir cleanup here. */ 397 /* TODO: orphan dir cleanup here. */
398bail_unlock_data:
399 ocfs2_data_unlock(inode, 1);
400
382bail: 401bail:
383 402
384 mlog_exit(status); 403 mlog_exit(status);
@@ -397,6 +416,7 @@ bail:
397 */ 416 */
398int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 417int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
399 struct inode *inode, 418 struct inode *inode,
419 u32 *logical_offset,
400 u32 clusters_to_add, 420 u32 clusters_to_add,
401 struct buffer_head *fe_bh, 421 struct buffer_head *fe_bh,
402 handle_t *handle, 422 handle_t *handle,
@@ -460,18 +480,14 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
460 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 480 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
461 mlog(0, "Allocating %u clusters at block %u for inode %llu\n", 481 mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
462 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 482 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
463 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, 483 status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
464 num_bits, meta_ac); 484 *logical_offset, block, num_bits,
485 meta_ac);
465 if (status < 0) { 486 if (status < 0) {
466 mlog_errno(status); 487 mlog_errno(status);
467 goto leave; 488 goto leave;
468 } 489 }
469 490
470 le32_add_cpu(&fe->i_clusters, num_bits);
471 spin_lock(&OCFS2_I(inode)->ip_lock);
472 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
473 spin_unlock(&OCFS2_I(inode)->ip_lock);
474
475 status = ocfs2_journal_dirty(handle, fe_bh); 491 status = ocfs2_journal_dirty(handle, fe_bh);
476 if (status < 0) { 492 if (status < 0) {
477 mlog_errno(status); 493 mlog_errno(status);
@@ -479,6 +495,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
479 } 495 }
480 496
481 clusters_to_add -= num_bits; 497 clusters_to_add -= num_bits;
498 *logical_offset += num_bits;
482 499
483 if (clusters_to_add) { 500 if (clusters_to_add) {
484 mlog(0, "need to alloc once more, clusters = %u, wanted = " 501 mlog(0, "need to alloc once more, clusters = %u, wanted = "
@@ -494,14 +511,87 @@ leave:
494 return status; 511 return status;
495} 512}
496 513
514/*
515 * For a given allocation, determine which allocators will need to be
516 * accessed, and lock them, reserving the appropriate number of bits.
517 *
518 * Called from ocfs2_extend_allocation() for file systems which don't
519 * support holes, and from ocfs2_write() for file systems which
520 * understand sparse inodes.
521 */
522int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
523 u32 clusters_to_add,
524 struct ocfs2_alloc_context **data_ac,
525 struct ocfs2_alloc_context **meta_ac)
526{
527 int ret, num_free_extents;
528 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
529
530 *meta_ac = NULL;
531 *data_ac = NULL;
532
533 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
534 "clusters_to_add = %u\n",
535 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
536 le32_to_cpu(di->i_clusters), clusters_to_add);
537
538 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
539 if (num_free_extents < 0) {
540 ret = num_free_extents;
541 mlog_errno(ret);
542 goto out;
543 }
544
545 /*
546 * Sparse allocation file systems need to be more conservative
547 * with reserving room for expansion - the actual allocation
548 * happens while we've got a journal handle open so re-taking
549 * a cluster lock (because we ran out of room for another
550 * extent) will violate ordering rules.
551 *
552 * Most of the time we'll only be seeing this 1 cluster at a time
553 * anyway.
554 */
555 if (!num_free_extents ||
556 (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) {
557 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
558 if (ret < 0) {
559 if (ret != -ENOSPC)
560 mlog_errno(ret);
561 goto out;
562 }
563 }
564
565 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
566 if (ret < 0) {
567 if (ret != -ENOSPC)
568 mlog_errno(ret);
569 goto out;
570 }
571
572out:
573 if (ret) {
574 if (*meta_ac) {
575 ocfs2_free_alloc_context(*meta_ac);
576 *meta_ac = NULL;
577 }
578
579 /*
580	 * We cannot have an error and a non-NULL *data_ac.
581 */
582 }
583
584 return ret;
585}
586
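
A note on the calling convention established by ocfs2_lock_allocators() above: both allocators are reserved before any journal handle is opened, so a transaction never has to re-take a cluster lock mid-handle. A minimal sketch of the expected call sequence, using only names that appear in this patch (illustrative, not a hunk; error paths elided):

	struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
	handle_t *handle;
	int status;

	/* Reserve data and (possibly) metadata bits up front... */
	status = ocfs2_lock_allocators(inode, fe, clusters_to_add,
				       &data_ac, &meta_ac);
	if (status)
		goto out;

	/* ...so the handle below never re-takes a cluster lock mid-way. */
	handle = ocfs2_start_trans(osb,
			ocfs2_calc_extend_credits(osb->sb, fe,
						  clusters_to_add));
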
497static int ocfs2_extend_allocation(struct inode *inode, 587static int ocfs2_extend_allocation(struct inode *inode,
498 u32 clusters_to_add) 588 u32 clusters_to_add)
499{ 589{
500 int status = 0; 590 int status = 0;
501 int restart_func = 0; 591 int restart_func = 0;
502 int drop_alloc_sem = 0; 592 int drop_alloc_sem = 0;
503 int credits, num_free_extents; 593 int credits;
504 u32 prev_clusters; 594 u32 prev_clusters, logical_start;
505 struct buffer_head *bh = NULL; 595 struct buffer_head *bh = NULL;
506 struct ocfs2_dinode *fe = NULL; 596 struct ocfs2_dinode *fe = NULL;
507 handle_t *handle = NULL; 597 handle_t *handle = NULL;
@@ -512,6 +602,12 @@ static int ocfs2_extend_allocation(struct inode *inode,
512 602
513 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 603 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
514 604
605 /*
606 * This function only exists for file systems which don't
607 * support holes.
608 */
609 BUG_ON(ocfs2_sparse_alloc(osb));
610
515 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 611 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
516 OCFS2_BH_CACHED, inode); 612 OCFS2_BH_CACHED, inode);
517 if (status < 0) { 613 if (status < 0) {
@@ -526,39 +622,11 @@ static int ocfs2_extend_allocation(struct inode *inode,
526 goto leave; 622 goto leave;
527 } 623 }
528 624
625 logical_start = OCFS2_I(inode)->ip_clusters;
626
529restart_all: 627restart_all:
530 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 628 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
531 629
532 mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, "
533 "clusters_to_add = %u\n",
534 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
535 fe->i_clusters, clusters_to_add);
536
537 num_free_extents = ocfs2_num_free_extents(osb,
538 inode,
539 fe);
540 if (num_free_extents < 0) {
541 status = num_free_extents;
542 mlog_errno(status);
543 goto leave;
544 }
545
546 if (!num_free_extents) {
547 status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
548 if (status < 0) {
549 if (status != -ENOSPC)
550 mlog_errno(status);
551 goto leave;
552 }
553 }
554
555 status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac);
556 if (status < 0) {
557 if (status != -ENOSPC)
558 mlog_errno(status);
559 goto leave;
560 }
561
562 /* blocks people in read/write from reading our allocation 630
563 * until we're done changing it. We depend on i_mutex to block 631 * until we're done changing it. We depend on i_mutex to block
564 * other extend/truncate calls while we're here. Ordering wrt 632 * other extend/truncate calls while we're here. Ordering wrt
@@ -566,6 +634,13 @@ restart_all:
566 down_write(&OCFS2_I(inode)->ip_alloc_sem); 634 down_write(&OCFS2_I(inode)->ip_alloc_sem);
567 drop_alloc_sem = 1; 635 drop_alloc_sem = 1;
568 636
637 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
638 &meta_ac);
639 if (status) {
640 mlog_errno(status);
641 goto leave;
642 }
643
569 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); 644 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
570 handle = ocfs2_start_trans(osb, credits); 645 handle = ocfs2_start_trans(osb, credits);
571 if (IS_ERR(handle)) { 646 if (IS_ERR(handle)) {
@@ -590,6 +665,7 @@ restarted_transaction:
590 665
591 status = ocfs2_do_extend_allocation(osb, 666 status = ocfs2_do_extend_allocation(osb,
592 inode, 667 inode,
668 &logical_start,
593 clusters_to_add, 669 clusters_to_add,
594 bh, 670 bh,
595 handle, 671 handle,
@@ -778,7 +854,7 @@ static int ocfs2_extend_file(struct inode *inode,
778 size_t tail_to_skip) 854 size_t tail_to_skip)
779{ 855{
780 int ret = 0; 856 int ret = 0;
781 u32 clusters_to_add; 857 u32 clusters_to_add = 0;
782 858
783 BUG_ON(!tail_to_skip && !di_bh); 859 BUG_ON(!tail_to_skip && !di_bh);
784 860
@@ -790,6 +866,11 @@ static int ocfs2_extend_file(struct inode *inode,
790 goto out; 866 goto out;
791 BUG_ON(new_i_size < i_size_read(inode)); 867 BUG_ON(new_i_size < i_size_read(inode));
792 868
869 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
870 BUG_ON(tail_to_skip != 0);
871 goto out_update_size;
872 }
873
793 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 874 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
794 OCFS2_I(inode)->ip_clusters; 875 OCFS2_I(inode)->ip_clusters;
795 876
@@ -825,6 +906,7 @@ static int ocfs2_extend_file(struct inode *inode,
825 goto out_unlock; 906 goto out_unlock;
826 } 907 }
827 908
909out_update_size:
828 if (!tail_to_skip) { 910 if (!tail_to_skip) {
829 /* We're being called from ocfs2_setattr() which wants 911 /* We're being called from ocfs2_setattr() which wants
830 * us to update i_size */ 912 * us to update i_size */
@@ -834,7 +916,8 @@ static int ocfs2_extend_file(struct inode *inode,
834 } 916 }
835 917
836out_unlock: 918out_unlock:
837 ocfs2_data_unlock(inode, 1); 919 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
920 ocfs2_data_unlock(inode, 1);
838 921
839out: 922out:
840 return ret; 923 return ret;
@@ -972,7 +1055,8 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
972 1055
973 ret = ocfs2_meta_lock(inode, NULL, 0); 1056 ret = ocfs2_meta_lock(inode, NULL, 0);
974 if (ret) { 1057 if (ret) {
975 mlog_errno(ret); 1058 if (ret != -ENOENT)
1059 mlog_errno(ret);
976 goto out; 1060 goto out;
977 } 1061 }
978 1062
@@ -1035,10 +1119,49 @@ out:
1035 return ret; 1119 return ret;
1036} 1120}
1037 1121
1122/*
1123 * Will look for holes and unwritten extents in the range starting at
1124 * pos for count bytes (inclusive).
1125 */
1126static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1127 size_t count)
1128{
1129 int ret = 0;
1130 unsigned int extent_flags;
1131 u32 cpos, clusters, extent_len, phys_cpos;
1132 struct super_block *sb = inode->i_sb;
1133
1134 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1135 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1136
1137 while (clusters) {
1138 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1139 &extent_flags);
1140 if (ret < 0) {
1141 mlog_errno(ret);
1142 goto out;
1143 }
1144
1145 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1146 ret = 1;
1147 break;
1148 }
1149
1150 if (extent_len > clusters)
1151 extent_len = clusters;
1152
1153 clusters -= extent_len;
1154 cpos += extent_len;
1155 }
1156out:
1157 return ret;
1158}
1159
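
The cpos/clusters arithmetic in ocfs2_check_range_for_holes() converts the byte range into a cluster range to scan; ocfs2_clusters_for_bytes() rounds up, so the end cluster is exclusive. A standalone worked example (illustrative only, assuming 4KB clusters, i.e. s_clustersize_bits = 12):

	#include <stdio.h>

	int main(void)
	{
		unsigned int cbits = 12;              /* 4KB clusters, assumed */
		long long pos = 10000, count = 5000;  /* hypothetical write */
		unsigned int cpos = (unsigned int)(pos >> cbits);
		/* round-up division, as ocfs2_clusters_for_bytes() does */
		unsigned int clusters = (unsigned int)
			((pos + count + (1LL << cbits) - 1) >> cbits) - cpos;

		printf("scan clusters [%u, %u)\n", cpos, cpos + clusters);
		/* prints: scan clusters [2, 4) */
		return 0;
	}
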
1038static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1160static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1039 loff_t *ppos, 1161 loff_t *ppos,
1040 size_t count, 1162 size_t count,
1041 int appending) 1163 int appending,
1164 int *direct_io)
1042{ 1165{
1043 int ret = 0, meta_level = appending; 1166 int ret = 0, meta_level = appending;
1044 struct inode *inode = dentry->d_inode; 1167 struct inode *inode = dentry->d_inode;
@@ -1089,6 +1212,49 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1089 } else { 1212 } else {
1090 saved_pos = *ppos; 1213 saved_pos = *ppos;
1091 } 1214 }
1215
1216 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
1217 loff_t end = saved_pos + count;
1218
1219 /*
1220 * Skip the O_DIRECT checks if we don't need
1221 * them.
1222 */
1223 if (!direct_io || !(*direct_io))
1224 break;
1225
1226 /*
1227 * Allowing concurrent direct writes means
1228 * i_size changes wouldn't be synchronized, so
1229 * one node could wind up truncating another
1230 * nodes writes.
1231 */
1232 if (end > i_size_read(inode)) {
1233 *direct_io = 0;
1234 break;
1235 }
1236
1237 /*
1238 * We don't fill holes during direct io, so
1239 * check for them here. If any are found, the
1240 * caller will have to retake some cluster
1241 * locks and initiate the io as buffered.
1242 */
1243 ret = ocfs2_check_range_for_holes(inode, saved_pos,
1244 count);
1245 if (ret == 1) {
1246 *direct_io = 0;
1247 ret = 0;
1248 } else if (ret < 0)
1249 mlog_errno(ret);
1250 break;
1251 }
1252
1253 /*
1254 * The rest of this loop is concerned with legacy file
1255 * systems which don't support sparse files.
1256 */
1257
1092 newsize = count + saved_pos; 1258 newsize = count + saved_pos;
1093 1259
1094 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", 1260 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
@@ -1141,55 +1307,264 @@ out:
1141 return ret; 1307 return ret;
1142} 1308}
1143 1309
1310static inline void
1311ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1312{
1313 const struct iovec *iov = *iovp;
1314 size_t base = *basep;
1315
1316 do {
1317 int copy = min(bytes, iov->iov_len - base);
1318
1319 bytes -= copy;
1320 base += copy;
1321 if (iov->iov_len == base) {
1322 iov++;
1323 base = 0;
1324 }
1325 } while (bytes);
1326 *iovp = iov;
1327 *basep = base;
1328}
1329
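
ocfs2_set_next_iovec() above advances an (iovec, offset) cursor by a byte count, stepping to the next segment exactly when the current one is exhausted. A standalone userspace demonstration of the same logic (mock, not part of the patch):

	#include <stdio.h>
	#include <sys/uio.h>

	static void set_next_iovec(const struct iovec **iovp, size_t *basep,
				   size_t bytes)
	{
		const struct iovec *iov = *iovp;
		size_t base = *basep;

		do {
			size_t len = iov->iov_len - base;
			size_t copy = bytes < len ? bytes : len;

			bytes -= copy;
			base += copy;
			if (iov->iov_len == base) {	/* segment consumed */
				iov++;
				base = 0;
			}
		} while (bytes);
		*iovp = iov;
		*basep = base;
	}

	int main(void)
	{
		char a[4], b[8];
		struct iovec v[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
		const struct iovec *cur = v;
		size_t off = 0;

		set_next_iovec(&cur, &off, 6);	/* eat all of a, 2 bytes of b */
		printf("segment %ld, offset %zu\n", (long)(cur - v), off);
		/* prints: segment 1, offset 2 */
		return 0;
	}
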
1330static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
1331 const struct iovec *cur_iov,
1332 size_t iov_offset)
1333{
1334 int ret;
1335 char *buf;
1336 struct page *src_page = NULL;
1337
1338 buf = cur_iov->iov_base + iov_offset;
1339
1340 if (!segment_eq(get_fs(), KERNEL_DS)) {
1341 /*
1342 * Pull in the user page. We want to do this outside
1343 * of the meta data locks in order to preserve locking
1344 * order in case of page fault.
1345 */
1346 ret = get_user_pages(current, current->mm,
1347 (unsigned long)buf & PAGE_CACHE_MASK, 1,
1348 0, 0, &src_page, NULL);
1349 if (ret == 1)
1350 bp->b_src_buf = kmap(src_page);
1351 else
1352 src_page = ERR_PTR(-EFAULT);
1353 } else {
1354 bp->b_src_buf = buf;
1355 }
1356
1357 return src_page;
1358}
1359
1360static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
1361 struct page *page)
1362{
1363 if (page) {
1364 kunmap(page);
1365 page_cache_release(page);
1366 }
1367}
1368
1369static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1370 const struct iovec *iov,
1371 unsigned long nr_segs,
1372 size_t count,
1373 ssize_t o_direct_written)
1374{
1375 int ret = 0;
1376 ssize_t copied, total = 0;
1377 size_t iov_offset = 0;
1378 const struct iovec *cur_iov = iov;
1379 struct ocfs2_buffered_write_priv bp;
1380 struct page *page;
1381
1382 /*
1383 * handle partial DIO write. Adjust cur_iov if needed.
1384 */
1385 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
1386
1387 do {
1388 bp.b_cur_off = iov_offset;
1389 bp.b_cur_iov = cur_iov;
1390
1391 page = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
1392 if (IS_ERR(page)) {
1393 ret = PTR_ERR(page);
1394 goto out;
1395 }
1396
1397 copied = ocfs2_buffered_write_cluster(file, *ppos, count,
1398 ocfs2_map_and_write_user_data,
1399 &bp);
1400
1401 ocfs2_put_write_source(&bp, page);
1402
1403 if (copied < 0) {
1404 mlog_errno(copied);
1405 ret = copied;
1406 goto out;
1407 }
1408
1409 total += copied;
1410 *ppos = *ppos + copied;
1411 count -= copied;
1412
1413 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
1414 } while(count);
1415
1416out:
1417 return total ? total : ret;
1418}
1419
1420static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted,
1421 unsigned long *nr_segs)
1422{
1423 size_t ocount; /* original count */
1424 unsigned long seg;
1425
1426 ocount = 0;
1427 for (seg = 0; seg < *nr_segs; seg++) {
1428 const struct iovec *iv = &iov[seg];
1429
1430 /*
1431 * If any segment has a negative length, or the cumulative
1432 * length ever wraps negative then return -EINVAL.
1433 */
1434 ocount += iv->iov_len;
1435 if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
1436 return -EINVAL;
1437 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1438 continue;
1439 if (seg == 0)
1440 return -EFAULT;
1441 *nr_segs = seg;
1442 ocount -= iv->iov_len; /* This segment is no good */
1443 break;
1444 }
1445
1446 *counted = ocount;
1447 return 0;
1448}
1449
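
The (ssize_t)(ocount | iv->iov_len) < 0 test in ocfs2_check_iovec() rejects, with one comparison, both a single segment whose length has the sign bit set and a running total that has wrapped into the sign bit. A standalone illustration (assumes size_t and ssize_t are the same width, as on Linux):

	#include <stdio.h>
	#include <sys/types.h>

	int main(void)
	{
		size_t iov_len = 100;             /* individually fine */
		size_t ocount = (size_t)-1 - 10;  /* running total wrapped */

		/* OR is negative iff either operand has the sign bit set */
		printf("%d\n", (ssize_t)(ocount | iov_len) < 0);  /* 1 */

		ocount = 100;                     /* sane running total */
		printf("%d\n", (ssize_t)(ocount | iov_len) < 0);  /* 0 */
		return 0;
	}
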
1144static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 1450static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1145 const struct iovec *iov, 1451 const struct iovec *iov,
1146 unsigned long nr_segs, 1452 unsigned long nr_segs,
1147 loff_t pos) 1453 loff_t pos)
1148{ 1454{
1149 int ret, rw_level, have_alloc_sem = 0; 1455 int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
1150 struct file *filp = iocb->ki_filp; 1456 int can_do_direct, sync = 0;
1151 struct inode *inode = filp->f_path.dentry->d_inode; 1457 ssize_t written = 0;
1152 int appending = filp->f_flags & O_APPEND ? 1 : 0; 1458 size_t ocount; /* original count */
1153 1459 size_t count; /* after file limit checks */
1154 mlog_entry("(0x%p, %u, '%.*s')\n", filp, 1460 loff_t *ppos = &iocb->ki_pos;
1461 struct file *file = iocb->ki_filp;
1462 struct inode *inode = file->f_path.dentry->d_inode;
1463
1464 mlog_entry("(0x%p, %u, '%.*s')\n", file,
1155 (unsigned int)nr_segs, 1465 (unsigned int)nr_segs,
1156 filp->f_path.dentry->d_name.len, 1466 file->f_path.dentry->d_name.len,
1157 filp->f_path.dentry->d_name.name); 1467 file->f_path.dentry->d_name.name);
1158 1468
1159 /* happy write of zero bytes */
1160 if (iocb->ki_left == 0) 1469 if (iocb->ki_left == 0)
1161 return 0; 1470 return 0;
1162 1471
1472 ret = ocfs2_check_iovec(iov, &ocount, &nr_segs);
1473 if (ret)
1474 return ret;
1475
1476 count = ocount;
1477
1478 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1479
1480 appending = file->f_flags & O_APPEND ? 1 : 0;
1481 direct_io = file->f_flags & O_DIRECT ? 1 : 0;
1482
1163 mutex_lock(&inode->i_mutex); 1483 mutex_lock(&inode->i_mutex);
1484
1485relock:
1164 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 1486 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
1165 if (filp->f_flags & O_DIRECT) { 1487 if (direct_io) {
1166 have_alloc_sem = 1;
1167 down_read(&inode->i_alloc_sem); 1488 down_read(&inode->i_alloc_sem);
1489 have_alloc_sem = 1;
1168 } 1490 }
1169 1491
1170 /* concurrent O_DIRECT writes are allowed */ 1492 /* concurrent O_DIRECT writes are allowed */
1171 rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; 1493 rw_level = !direct_io;
1172 ret = ocfs2_rw_lock(inode, rw_level); 1494 ret = ocfs2_rw_lock(inode, rw_level);
1173 if (ret < 0) { 1495 if (ret < 0) {
1174 rw_level = -1;
1175 mlog_errno(ret); 1496 mlog_errno(ret);
1176 goto out; 1497 goto out_sems;
1177 } 1498 }
1178 1499
1179 ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos, 1500 can_do_direct = direct_io;
1180 iocb->ki_left, appending); 1501 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
1502 iocb->ki_left, appending,
1503 &can_do_direct);
1181 if (ret < 0) { 1504 if (ret < 0) {
1182 mlog_errno(ret); 1505 mlog_errno(ret);
1183 goto out; 1506 goto out;
1184 } 1507 }
1185 1508
1186 /* communicate with ocfs2_dio_end_io */ 1509 /*
1187 ocfs2_iocb_set_rw_locked(iocb); 1510 * We can't complete the direct I/O as requested, fall back to
1511 * buffered I/O.
1512 */
1513 if (direct_io && !can_do_direct) {
1514 ocfs2_rw_unlock(inode, rw_level);
1515 up_read(&inode->i_alloc_sem);
1516
1517 have_alloc_sem = 0;
1518 rw_level = -1;
1188 1519
1189 ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos); 1520 direct_io = 0;
1521 sync = 1;
1522 goto relock;
1523 }
1524
1525 if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
1526 sync = 1;
1527
1528 /*
1529 * XXX: Is it ok to execute these checks a second time?
1530 */
1531 ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
1532 if (ret)
1533 goto out;
1534
1535 /*
1536 * Set pos so that sync_page_range_nolock() below understands
1537 * where to start from. We might've moved it around via the
1538 * calls above. The range we want to actually sync starts from
1539 * *ppos here.
1540 *
1541 */
1542 pos = *ppos;
1543
1544 /* communicate with ocfs2_dio_end_io */
1545 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1546
1547 if (direct_io) {
1548 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
1549 ppos, count, ocount);
1550 if (written < 0) {
1551 ret = written;
1552 goto out_dio;
1553 }
1554 } else {
1555 written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
1556 count, written);
1557 if (written < 0) {
1558 ret = written;
1559 if (ret != -EFAULT && ret != -ENOSPC)
1560 mlog_errno(ret);
1561 goto out;
1562 }
1563 }
1190 1564
1565out_dio:
1191 /* buffered aio wouldn't have proper lock coverage today */ 1566 /* buffered aio wouldn't have proper lock coverage today */
1192 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 1567 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
1193 1568
1194 /* 1569 /*
1195 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 1570 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@@ -1207,13 +1582,102 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1207 } 1582 }
1208 1583
1209out: 1584out:
1585 if (rw_level != -1)
1586 ocfs2_rw_unlock(inode, rw_level);
1587
1588out_sems:
1210 if (have_alloc_sem) 1589 if (have_alloc_sem)
1211 up_read(&inode->i_alloc_sem); 1590 up_read(&inode->i_alloc_sem);
1212 if (rw_level != -1) 1591
1213 ocfs2_rw_unlock(inode, rw_level); 1592 if (written > 0 && sync) {
1593 ssize_t err;
1594
1595 err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
1596 if (err < 0)
1597 written = err;
1598 }
1599
1214 mutex_unlock(&inode->i_mutex); 1600 mutex_unlock(&inode->i_mutex);
1215 1601
1216 mlog_exit(ret); 1602 mlog_exit(ret);
1603 return written ? written : ret;
1604}
1605
1606static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
1607 struct pipe_buffer *buf,
1608 struct splice_desc *sd)
1609{
1610 int ret, count, total = 0;
1611 ssize_t copied = 0;
1612 struct ocfs2_splice_write_priv sp;
1613
1614 ret = buf->ops->pin(pipe, buf);
1615 if (ret)
1616 goto out;
1617
1618 sp.s_sd = sd;
1619 sp.s_buf = buf;
1620 sp.s_pipe = pipe;
1621 sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
1622 sp.s_buf_offset = buf->offset;
1623
1624 count = sd->len;
1625 if (count + sp.s_offset > PAGE_CACHE_SIZE)
1626 count = PAGE_CACHE_SIZE - sp.s_offset;
1627
1628 do {
1629 /*
1630 * splice wants us to copy up to one page at a
1631 * time. For pagesize > cluster size, this means we
1632 * might enter ocfs2_buffered_write_cluster() more
1633 * than once, so keep track of our progress here.
1634 */
1635 copied = ocfs2_buffered_write_cluster(sd->file,
1636 (loff_t)sd->pos + total,
1637 count,
1638 ocfs2_map_and_write_splice_data,
1639 &sp);
1640 if (copied < 0) {
1641 mlog_errno(copied);
1642 ret = copied;
1643 goto out;
1644 }
1645
1646 count -= copied;
1647 sp.s_offset += copied;
1648 sp.s_buf_offset += copied;
1649 total += copied;
1650 } while (count);
1651
1652 ret = 0;
1653out:
1654
1655 return total ? total : ret;
1656}
1657
1658static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1659 struct file *out,
1660 loff_t *ppos,
1661 size_t len,
1662 unsigned int flags)
1663{
1664 int ret, err;
1665 struct address_space *mapping = out->f_mapping;
1666 struct inode *inode = mapping->host;
1667
1668 ret = __splice_from_pipe(pipe, out, ppos, len, flags,
1669 ocfs2_splice_write_actor);
1670 if (ret > 0) {
1671 *ppos += ret;
1672
1673 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
1674 err = generic_osync_inode(inode, mapping,
1675 OSYNC_METADATA|OSYNC_DATA);
1676 if (err)
1677 ret = err;
1678 }
1679 }
1680
1217 return ret; 1681 return ret;
1218} 1682}
1219 1683
@@ -1239,14 +1703,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1239 goto out; 1703 goto out;
1240 } 1704 }
1241 1705
1242 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0); 1706 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
1707 NULL);
1243 if (ret < 0) { 1708 if (ret < 0) {
1244 mlog_errno(ret); 1709 mlog_errno(ret);
1245 goto out_unlock; 1710 goto out_unlock;
1246 } 1711 }
1247 1712
1248 /* ok, we're done with i_size and alloc work */ 1713 /* ok, we're done with i_size and alloc work */
1249 ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); 1714 ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags);
1250 1715
1251out_unlock: 1716out_unlock:
1252 ocfs2_rw_unlock(inode, 1); 1717 ocfs2_rw_unlock(inode, 1);
@@ -1323,7 +1788,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
1323 } 1788 }
1324 rw_level = 0; 1789 rw_level = 0;
1325 /* communicate with ocfs2_dio_end_io */ 1790 /* communicate with ocfs2_dio_end_io */
1326 ocfs2_iocb_set_rw_locked(iocb); 1791 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1327 } 1792 }
1328 1793
1329 /* 1794 /*
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index cc973f01f6ce..2c4460fced52 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -39,12 +39,17 @@ enum ocfs2_alloc_restarted {
39}; 39};
40int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 40int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
41 struct inode *inode, 41 struct inode *inode,
42 u32 *cluster_start,
42 u32 clusters_to_add, 43 u32 clusters_to_add,
43 struct buffer_head *fe_bh, 44 struct buffer_head *fe_bh,
44 handle_t *handle, 45 handle_t *handle,
45 struct ocfs2_alloc_context *data_ac, 46 struct ocfs2_alloc_context *data_ac,
46 struct ocfs2_alloc_context *meta_ac, 47 struct ocfs2_alloc_context *meta_ac,
47 enum ocfs2_alloc_restarted *reason); 48 enum ocfs2_alloc_restarted *reason);
49int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
50 u32 clusters_to_add,
51 struct ocfs2_alloc_context **data_ac,
52 struct ocfs2_alloc_context **meta_ac);
48int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 53int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
49int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, 54int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
50 struct kstat *stat); 55 struct kstat *stat);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 28ab56f2b98c..21a605079c62 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -89,24 +89,6 @@ void ocfs2_set_inode_flags(struct inode *inode)
89 inode->i_flags |= S_DIRSYNC; 89 inode->i_flags |= S_DIRSYNC;
90} 90}
91 91
92struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
93 u64 blkno,
94 int delete_vote)
95{
96 struct ocfs2_find_inode_args args;
97
98 /* ocfs2_ilookup_for_vote should *only* be called from the
99 * vote thread */
100 BUG_ON(current != osb->vote_task);
101
102 args.fi_blkno = blkno;
103 args.fi_flags = OCFS2_FI_FLAG_NOWAIT;
104 if (delete_vote)
105 args.fi_flags |= OCFS2_FI_FLAG_DELETE;
106 args.fi_ino = ino_from_blkno(osb->sb, blkno);
107 return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args);
108}
109
110struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) 92struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
111{ 93{
112 struct inode *inode = NULL; 94 struct inode *inode = NULL;
@@ -182,28 +164,6 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque)
182 if (oi->ip_blkno != args->fi_blkno) 164 if (oi->ip_blkno != args->fi_blkno)
183 goto bail; 165 goto bail;
184 166
185 /* OCFS2_FI_FLAG_NOWAIT is *only* set from
186 * ocfs2_ilookup_for_vote which won't create an inode for one
187 * that isn't found. The vote thread which doesn't want to get
188 * an inode which is in the process of going away - otherwise
189 * the call to __wait_on_freeing_inode in find_inode_fast will
190 * cause it to deadlock on an inode which may be waiting on a
191 * vote (or lock release) in delete_inode */
192 if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) &&
193 (inode->i_state & (I_FREEING|I_CLEAR))) {
194 /* As stated above, we're not going to return an
195 * inode. In the case of a delete vote, the voting
196 * code is going to signal the other node to go
197 * ahead. Mark that state here, so this freeing inode
198 * has the state when it gets to delete_inode. */
199 if (args->fi_flags & OCFS2_FI_FLAG_DELETE) {
200 spin_lock(&oi->ip_lock);
201 ocfs2_mark_inode_remotely_deleted(inode);
202 spin_unlock(&oi->ip_lock);
203 }
204 goto bail;
205 }
206
207 ret = 1; 167 ret = 1;
208bail: 168bail:
209 mlog_exit(ret); 169 mlog_exit(ret);
@@ -261,6 +221,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
261 goto bail; 221 goto bail;
262 } 222 }
263 223
224 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
225 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
226
264 inode->i_version = 1; 227 inode->i_version = 1;
265 inode->i_generation = le32_to_cpu(fe->i_generation); 228 inode->i_generation = le32_to_cpu(fe->i_generation);
266 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); 229 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
@@ -272,8 +235,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
272 if (S_ISLNK(inode->i_mode) && !fe->i_clusters) 235 if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
273 inode->i_blocks = 0; 236 inode->i_blocks = 0;
274 else 237 else
275 inode->i_blocks = 238 inode->i_blocks = ocfs2_inode_sector_count(inode);
276 ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
277 inode->i_mapping->a_ops = &ocfs2_aops; 239 inode->i_mapping->a_ops = &ocfs2_aops;
278 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); 240 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
279 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); 241 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
@@ -288,10 +250,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
288 (unsigned long long)OCFS2_I(inode)->ip_blkno, 250 (unsigned long long)OCFS2_I(inode)->ip_blkno,
289 (unsigned long long)fe->i_blkno); 251 (unsigned long long)fe->i_blkno);
290 252
291 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
292 OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
293 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
294
295 inode->i_nlink = le16_to_cpu(fe->i_links_count); 253 inode->i_nlink = le16_to_cpu(fe->i_links_count);
296 254
297 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) 255 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
@@ -347,6 +305,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
347 305
348 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, 306 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
349 OCFS2_LOCK_TYPE_META, 0, inode); 307 OCFS2_LOCK_TYPE_META, 0, inode);
308
309 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
310 OCFS2_LOCK_TYPE_OPEN, 0, inode);
350 } 311 }
351 312
352 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, 313 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
@@ -421,7 +382,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
421 * cluster lock before trusting anything anyway. 382 * cluster lock before trusting anything anyway.
422 */ 383 */
423 can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) 384 can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
424 && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK) 385 && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
425 && !ocfs2_mount_local(osb); 386 && !ocfs2_mount_local(osb);
426 387
427 /* 388 /*
@@ -438,7 +399,17 @@ static int ocfs2_read_locked_inode(struct inode *inode,
438 OCFS2_LOCK_TYPE_META, 399 OCFS2_LOCK_TYPE_META,
439 generation, inode); 400 generation, inode);
440 401
402 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
403 OCFS2_LOCK_TYPE_OPEN,
404 0, inode);
405
441 if (can_lock) { 406 if (can_lock) {
407 status = ocfs2_open_lock(inode);
408 if (status) {
409 make_bad_inode(inode);
410 mlog_errno(status);
411 return status;
412 }
442 status = ocfs2_meta_lock(inode, NULL, 0); 413 status = ocfs2_meta_lock(inode, NULL, 0);
443 if (status) { 414 if (status) {
444 make_bad_inode(inode); 415 make_bad_inode(inode);
@@ -447,6 +418,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
447 } 418 }
448 } 419 }
449 420
421 if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
422 status = ocfs2_try_open_lock(inode, 0);
423 if (status) {
424 make_bad_inode(inode);
425 return status;
426 }
427 }
428
450 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, 429 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
451 can_lock ? inode : NULL); 430 can_lock ? inode : NULL);
452 if (status < 0) { 431 if (status < 0) {
@@ -507,50 +486,56 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
507 struct buffer_head *fe_bh) 486 struct buffer_head *fe_bh)
508{ 487{
509 int status = 0; 488 int status = 0;
510 handle_t *handle = NULL;
511 struct ocfs2_truncate_context *tc = NULL; 489 struct ocfs2_truncate_context *tc = NULL;
512 struct ocfs2_dinode *fe; 490 struct ocfs2_dinode *fe;
491 handle_t *handle = NULL;
513 492
514 mlog_entry_void(); 493 mlog_entry_void();
515 494
516 fe = (struct ocfs2_dinode *) fe_bh->b_data; 495 fe = (struct ocfs2_dinode *) fe_bh->b_data;
517 496
518 /* zero allocation, zero truncate :) */ 497 if (fe->i_clusters) {
519 if (!fe->i_clusters) 498 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
520 goto bail; 499 if (IS_ERR(handle)) {
500 status = PTR_ERR(handle);
501 mlog_errno(status);
502 goto out;
503 }
521 504
522 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 505 status = ocfs2_journal_access(handle, inode, fe_bh,
523 if (IS_ERR(handle)) { 506 OCFS2_JOURNAL_ACCESS_WRITE);
524 status = PTR_ERR(handle); 507 if (status < 0) {
525 handle = NULL; 508 mlog_errno(status);
526 mlog_errno(status); 509 goto out;
527 goto bail; 510 }
528 }
529 511
530 status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL); 512 i_size_write(inode, 0);
531 if (status < 0) {
532 mlog_errno(status);
533 goto bail;
534 }
535 513
536 ocfs2_commit_trans(osb, handle); 514 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
537 handle = NULL; 515 if (status < 0) {
516 mlog_errno(status);
517 goto out;
518 }
538 519
539 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); 520 ocfs2_commit_trans(osb, handle);
540 if (status < 0) { 521 handle = NULL;
541 mlog_errno(status);
542 goto bail;
543 }
544 522
545 status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); 523 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
546 if (status < 0) { 524 if (status < 0) {
547 mlog_errno(status); 525 mlog_errno(status);
548 goto bail; 526 goto out;
527 }
528
529 status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
530 if (status < 0) {
531 mlog_errno(status);
532 goto out;
533 }
549 } 534 }
550bail: 535
536out:
551 if (handle) 537 if (handle)
552 ocfs2_commit_trans(osb, handle); 538 ocfs2_commit_trans(osb, handle);
553
554 mlog_exit(status); 539 mlog_exit(status);
555 return status; 540 return status;
556} 541}
@@ -678,10 +663,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
678 struct inode *orphan_dir_inode = NULL; 663 struct inode *orphan_dir_inode = NULL;
679 struct buffer_head *orphan_dir_bh = NULL; 664 struct buffer_head *orphan_dir_bh = NULL;
680 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 665 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
666 struct ocfs2_dinode *di;
681 667
682 /* We've already voted on this so it should be readonly - no 668 di = (struct ocfs2_dinode *) di_bh->b_data;
683 * spinlock needed. */ 669 orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
684 orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
685 670
686 status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); 671 status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
687 if (status) 672 if (status)
@@ -839,11 +824,20 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
839 goto bail; 824 goto bail;
840 } 825 }
841 826
842 status = ocfs2_request_delete_vote(inode); 827 /*
843 /* -EBUSY means that other nodes are still using the 828 * This is how ocfs2 determines whether an inode is still live
844 * inode. We're done here though, so avoid doing anything on 829 * within the cluster. Every node takes a shared read lock on
845 * disk and let them worry about deleting it. */ 830 * the inode open lock in ocfs2_read_locked_inode(). When we
846 if (status == -EBUSY) { 831 * get to ->delete_inode(), each node tries to convert its
832 * lock to an exclusive. Trylocks are serialized by the inode
833 * meta data lock. If the upconvert succeeds, we know the inode
834 * is no longer live and can be deleted.
835 *
836 * Though we call this with the meta data lock held, the
837 * trylock keeps us from ABBA deadlock.
838 */
839 status = ocfs2_try_open_lock(inode, 1);
840 if (status == -EAGAIN) {
847 status = 0; 841 status = 0;
848 mlog(0, "Skipping delete of %llu because it is in use on" 842 mlog(0, "Skipping delete of %llu because it is in use on"
849 "other nodes\n", (unsigned long long)oi->ip_blkno); 843 "other nodes\n", (unsigned long long)oi->ip_blkno);
@@ -854,21 +848,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
854 goto bail; 848 goto bail;
855 } 849 }
856 850
857 spin_lock(&oi->ip_lock); 851 *wipe = 1;
858 if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) { 852 mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
859 /* Nobody knew which slot this inode was orphaned 853 (unsigned long long)oi->ip_blkno,
860 * into. This may happen during node death and 854 le16_to_cpu(di->i_orphaned_slot));
861 * recovery knows how to clean it up so we can safely
862 * ignore this inode for now on. */
863 mlog(0, "Nobody knew where inode %llu was orphaned!\n",
864 (unsigned long long)oi->ip_blkno);
865 } else {
866 *wipe = 1;
867
868 mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n",
869 (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot);
870 }
871 spin_unlock(&oi->ip_lock);
872 855
873bail: 856bail:
874 return status; 857 return status;
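
The comment in this hunk compresses the new liveness protocol into prose; the lock life cycle it describes, using only functions added or referenced by this patch, looks roughly like this (editorial sketch, not a hunk):

	/* ocfs2_read_locked_inode(): every node holds a shared open lock */
	status = ocfs2_open_lock(inode);	/* PR, held until clear_inode */

	/* ocfs2_query_inode_wipe(): nodes race in ->delete_inode() */
	status = ocfs2_try_open_lock(inode, 1);	/* try upconvert to EX */
	if (status == -EAGAIN) {
		/* some node still holds PR: inode is live, skip the wipe */
	} else if (status == 0) {
		/* EX granted: nobody else has it open, safe to wipe */
	}

	/* ocfs2_clear_inode(): drop whichever open lock we ended up with */
	ocfs2_open_unlock(inode);
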
@@ -1001,11 +984,16 @@ void ocfs2_clear_inode(struct inode *inode)
1001 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, 984 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
1002 "Inode=%lu\n", inode->i_ino); 985 "Inode=%lu\n", inode->i_ino);
1003 986
987 /* The open lock replaces the old delete_inode vote: it was taken
988 * at iget time, so drop the PR and EX open locks here. */
989 ocfs2_open_unlock(inode);
990
1004 /* Do these before all the other work so that we don't bounce 991 /* Do these before all the other work so that we don't bounce
1005 * the vote thread while waiting to destroy the locks. */ 992 * the vote thread while waiting to destroy the locks. */
1006 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); 993 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
1007 ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); 994 ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
1008 ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); 995 ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
996 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
1009 997
1010 /* We very well may get a clear_inode before all of an inode's 998
1011 * metadata has hit disk. Of course, we can't drop any cluster 999 * metadata has hit disk. Of course, we can't drop any cluster
@@ -1020,8 +1008,7 @@ void ocfs2_clear_inode(struct inode *inode)
1020 "Clear inode of %llu, inode has io markers\n", 1008 "Clear inode of %llu, inode has io markers\n",
1021 (unsigned long long)oi->ip_blkno); 1009 (unsigned long long)oi->ip_blkno);
1022 1010
1023 ocfs2_extent_map_drop(inode, 0); 1011 ocfs2_extent_map_trunc(inode, 0);
1024 ocfs2_extent_map_init(inode);
1025 1012
1026 status = ocfs2_drop_inode_locks(inode); 1013 status = ocfs2_drop_inode_locks(inode);
1027 if (status < 0) 1014 if (status < 0)
@@ -1030,6 +1017,7 @@ void ocfs2_clear_inode(struct inode *inode)
1030 ocfs2_lock_res_free(&oi->ip_rw_lockres); 1017 ocfs2_lock_res_free(&oi->ip_rw_lockres);
1031 ocfs2_lock_res_free(&oi->ip_meta_lockres); 1018 ocfs2_lock_res_free(&oi->ip_meta_lockres);
1032 ocfs2_lock_res_free(&oi->ip_data_lockres); 1019 ocfs2_lock_res_free(&oi->ip_data_lockres);
1020 ocfs2_lock_res_free(&oi->ip_open_lockres);
1033 1021
1034 ocfs2_metadata_cache_purge(inode); 1022 ocfs2_metadata_cache_purge(inode);
1035 1023
@@ -1086,9 +1074,6 @@ void ocfs2_drop_inode(struct inode *inode)
1086 mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n", 1074 mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
1087 (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); 1075 (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
1088 1076
1089 /* Testing ip_orphaned_slot here wouldn't work because we may
1090 * not have gotten a delete_inode vote from any other nodes
1091 * yet. */
1092 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) 1077 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
1093 generic_delete_inode(inode); 1078 generic_delete_inode(inode);
1094 else 1079 else
@@ -1121,8 +1106,8 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
1121 return NULL; 1106 return NULL;
1122 } 1107 }
1123 1108
1124 tmperr = ocfs2_extent_map_get_blocks(inode, block, 1, 1109 tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
1125 &p_blkno, NULL); 1110 NULL);
1126 if (tmperr < 0) { 1111 if (tmperr < 0) {
1127 mlog_errno(tmperr); 1112 mlog_errno(tmperr);
1128 goto fail; 1113 goto fail;
@@ -1259,7 +1244,7 @@ void ocfs2_refresh_inode(struct inode *inode,
1259 if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) 1244 if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
1260 inode->i_blocks = 0; 1245 inode->i_blocks = 0;
1261 else 1246 else
1262 inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode)); 1247 inode->i_blocks = ocfs2_inode_sector_count(inode);
1263 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); 1248 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
1264 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); 1249 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
1265 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); 1250 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 1a7dd2945b34..03ae075869ee 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -26,6 +26,8 @@
26#ifndef OCFS2_INODE_H 26#ifndef OCFS2_INODE_H
27#define OCFS2_INODE_H 27#define OCFS2_INODE_H
28 28
29#include "extent_map.h"
30
29/* OCFS2 Inode Private Data */ 31/* OCFS2 Inode Private Data */
30struct ocfs2_inode_info 32struct ocfs2_inode_info
31{ 33{
@@ -34,6 +36,7 @@ struct ocfs2_inode_info
34 struct ocfs2_lock_res ip_rw_lockres; 36 struct ocfs2_lock_res ip_rw_lockres;
35 struct ocfs2_lock_res ip_meta_lockres; 37 struct ocfs2_lock_res ip_meta_lockres;
36 struct ocfs2_lock_res ip_data_lockres; 38 struct ocfs2_lock_res ip_data_lockres;
39 struct ocfs2_lock_res ip_open_lockres;
37 40
38 /* protects allocation changes on this inode. */ 41 /* protects allocation changes on this inode. */
39 struct rw_semaphore ip_alloc_sem; 42 struct rw_semaphore ip_alloc_sem;
@@ -42,9 +45,7 @@ struct ocfs2_inode_info
42 spinlock_t ip_lock; 45 spinlock_t ip_lock;
43 u32 ip_open_count; 46 u32 ip_open_count;
44 u32 ip_clusters; 47 u32 ip_clusters;
45 struct ocfs2_extent_map ip_map;
46 struct list_head ip_io_markers; 48 struct list_head ip_io_markers;
47 int ip_orphaned_slot;
48 49
49 struct mutex ip_io_mutex; 50 struct mutex ip_io_mutex;
50 51
@@ -64,6 +65,8 @@ struct ocfs2_inode_info
64 65
65 struct ocfs2_caching_info ip_metadata_cache; 66 struct ocfs2_caching_info ip_metadata_cache;
66 67
68 struct ocfs2_extent_map ip_extent_map;
69
67 struct inode vfs_inode; 70 struct inode vfs_inode;
68}; 71};
69 72
@@ -117,14 +120,9 @@ void ocfs2_delete_inode(struct inode *inode);
117void ocfs2_drop_inode(struct inode *inode); 120void ocfs2_drop_inode(struct inode *inode);
118 121
119/* Flags for ocfs2_iget() */ 122/* Flags for ocfs2_iget() */
120#define OCFS2_FI_FLAG_NOWAIT 0x1 123#define OCFS2_FI_FLAG_SYSFILE 0x4
121#define OCFS2_FI_FLAG_DELETE 0x2 124#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8
122#define OCFS2_FI_FLAG_SYSFILE 0x4
123#define OCFS2_FI_FLAG_NOLOCK 0x8
124struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); 125struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
125struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
126 u64 blkno,
127 int delete_vote);
128int ocfs2_inode_init_private(struct inode *inode); 126int ocfs2_inode_init_private(struct inode *inode);
129int ocfs2_inode_revalidate(struct dentry *dentry); 127int ocfs2_inode_revalidate(struct dentry *dentry);
130int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 128int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
@@ -144,4 +142,11 @@ int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
144 142
145void ocfs2_set_inode_flags(struct inode *inode); 143void ocfs2_set_inode_flags(struct inode *inode);
146 144
145static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
146{
147 int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
148
149 return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
150}
151
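
ocfs2_inode_sector_count() shifts the cluster count straight into 512-byte sectors. A worked example (illustrative, assuming 4KB clusters):

	#include <stdio.h>

	int main(void)
	{
		unsigned int clustersize_bits = 12; /* 4KB clusters, assumed */
		unsigned int ip_clusters = 5;       /* hypothetical inode */
		int c_to_s_bits = (int)clustersize_bits - 9;

		/* 5 clusters * 8 sectors per cluster = 40 sectors */
		printf("%u sectors\n", ip_clusters << c_to_s_bits);
		return 0;
	}
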
147#endif /* OCFS2_INODE_H */ 152#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 825cb0ae1b4c..5a8a90d1c787 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -649,29 +649,20 @@ bail:
649static int ocfs2_force_read_journal(struct inode *inode) 649static int ocfs2_force_read_journal(struct inode *inode)
650{ 650{
651 int status = 0; 651 int status = 0;
652 int i, p_blocks; 652 int i;
653 u64 v_blkno, p_blkno; 653 u64 v_blkno, p_blkno, p_blocks, num_blocks;
654#define CONCURRENT_JOURNAL_FILL 32 654#define CONCURRENT_JOURNAL_FILL 32ULL
655 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; 655 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
656 656
657 mlog_entry_void(); 657 mlog_entry_void();
658 658
659 BUG_ON(inode->i_blocks !=
660 ocfs2_align_bytes_to_sectors(i_size_read(inode)));
661
662 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); 659 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
663 660
664 mlog(0, "Force reading %llu blocks\n", 661 num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
665 (unsigned long long)(inode->i_blocks >>
666 (inode->i_sb->s_blocksize_bits - 9)));
667
668 v_blkno = 0; 662 v_blkno = 0;
669 while (v_blkno < 663 while (v_blkno < num_blocks) {
670 (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
671
672 status = ocfs2_extent_map_get_blocks(inode, v_blkno, 664 status = ocfs2_extent_map_get_blocks(inode, v_blkno,
673 1, &p_blkno, 665 &p_blkno, &p_blocks, NULL);
674 &p_blocks);
675 if (status < 0) { 666 if (status < 0) {
676 mlog_errno(status); 667 mlog_errno(status);
677 goto bail; 668 goto bail;
@@ -1306,7 +1297,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1306 continue; 1297 continue;
1307 1298
1308 iter = ocfs2_iget(osb, le64_to_cpu(de->inode), 1299 iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
1309 OCFS2_FI_FLAG_NOLOCK); 1300 OCFS2_FI_FLAG_ORPHAN_RECOVERY);
1310 if (IS_ERR(iter)) 1301 if (IS_ERR(iter))
1311 continue; 1302 continue;
1312 1303
@@ -1418,7 +1409,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1418 /* Set the proper information to get us going into 1409 /* Set the proper information to get us going into
1419 * ocfs2_delete_inode. */ 1410 * ocfs2_delete_inode. */
1420 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; 1411 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
1421 oi->ip_orphaned_slot = slot;
1422 spin_unlock(&oi->ip_lock); 1412 spin_unlock(&oi->ip_lock);
1423 1413
1424 iput(inode); 1414 iput(inode);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d026b4f27757..3db5de4506da 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -390,7 +390,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
390 /* We may be deleting metadata blocks, so metadata alloc dinode + 390 /* We may be deleting metadata blocks, so metadata alloc dinode +
391 one desc. block for each possible delete. */ 391 one desc. block for each possible delete. */
392 if (tree_depth && next_free == 1 && 392 if (tree_depth && next_free == 1 &&
393 le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del) 393 ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
394 credits += 1 + tree_depth; 394 credits += 1 + tree_depth;
395 395
396 /* update to the truncate log. */ 396 /* update to the truncate log. */
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 51b020447683..af01158b39f5 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -85,8 +85,11 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
85 int ret = 0, lock_level = 0; 85 int ret = 0, lock_level = 0;
86 struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); 86 struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb);
87 87
88 /* We don't want to support shared writable mappings yet. */ 88 /*
89 if (!ocfs2_mount_local(osb) && 89 * Only support shared writeable mmap for local mounts which
90 * don't know about holes.
91 */
92 if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) &&
90 ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && 93 ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) &&
91 ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { 94 ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
92 mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); 95 mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 28dd757ff67d..2bcf353fd7c5 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -175,8 +175,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
175 175
176 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); 176 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
177 if (IS_ERR(inode)) { 177 if (IS_ERR(inode)) {
178 mlog(ML_ERROR, "Unable to create inode %llu\n",
179 (unsigned long long)blkno);
180 ret = ERR_PTR(-EACCES); 178 ret = ERR_PTR(-EACCES);
181 goto bail_unlock; 179 goto bail_unlock;
182 } 180 }
@@ -189,7 +187,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
189 * unlink. */ 187 * unlink. */
190 spin_lock(&oi->ip_lock); 188 spin_lock(&oi->ip_lock);
191 oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; 189 oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
192 oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
193 spin_unlock(&oi->ip_lock); 190 spin_unlock(&oi->ip_lock);
194 191
195bail_add: 192bail_add:
@@ -288,7 +285,7 @@ static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
288 285
289 i_size_write(inode, inode->i_sb->s_blocksize); 286 i_size_write(inode, inode->i_sb->s_blocksize);
290 inode->i_nlink = 2; 287 inode->i_nlink = 2;
291 inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize); 288 inode->i_blocks = ocfs2_inode_sector_count(inode);
292 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 289 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
293 if (status < 0) { 290 if (status < 0) {
294 mlog_errno(status); 291 mlog_errno(status);
@@ -1486,8 +1483,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1486 struct buffer_head **bhs = NULL; 1483 struct buffer_head **bhs = NULL;
1487 const char *c; 1484 const char *c;
1488 struct super_block *sb = osb->sb; 1485 struct super_block *sb = osb->sb;
1489 u64 p_blkno; 1486 u64 p_blkno, p_blocks;
1490 int p_blocks;
1491 int virtual, blocks, status, i, bytes_left; 1487 int virtual, blocks, status, i, bytes_left;
1492 1488
1493 bytes_left = i_size_read(inode) + 1; 1489 bytes_left = i_size_read(inode) + 1;
@@ -1514,8 +1510,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1514 goto bail; 1510 goto bail;
1515 } 1511 }
1516 1512
1517 status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno, 1513 status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks,
1518 &p_blocks); 1514 NULL);
1519 if (status < 0) { 1515 if (status < 0) {
1520 mlog_errno(status); 1516 mlog_errno(status);
1521 goto bail; 1517 goto bail;
@@ -1674,8 +1670,11 @@ static int ocfs2_symlink(struct inode *dir,
1674 inode->i_rdev = 0; 1670 inode->i_rdev = 0;
1675 newsize = l - 1; 1671 newsize = l - 1;
1676 if (l > ocfs2_fast_symlink_chars(sb)) { 1672 if (l > ocfs2_fast_symlink_chars(sb)) {
1673 u32 offset = 0;
1674
1677 inode->i_op = &ocfs2_symlink_inode_operations; 1675 inode->i_op = &ocfs2_symlink_inode_operations;
1678 status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh, 1676 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1,
1677 new_fe_bh,
1679 handle, data_ac, NULL, 1678 handle, data_ac, NULL,
1680 NULL); 1679 NULL);
1681 if (status < 0) { 1680 if (status < 0) {
@@ -1689,7 +1688,7 @@ static int ocfs2_symlink(struct inode *dir,
1689 goto bail; 1688 goto bail;
1690 } 1689 }
1691 i_size_write(inode, newsize); 1690 i_size_write(inode, newsize);
1692 inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize); 1691 inode->i_blocks = ocfs2_inode_sector_count(inode);
1693 } else { 1692 } else {
1694 inode->i_op = &ocfs2_fast_symlink_inode_operations; 1693 inode->i_op = &ocfs2_fast_symlink_inode_operations;
1695 memcpy((char *) fe->id2.i_symlink, symname, l); 1694 memcpy((char *) fe->id2.i_symlink, symname, l);
@@ -2222,9 +2221,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2222 /* Record which orphan dir our inode now resides 2221 /* Record which orphan dir our inode now resides
2223 * in. delete_inode will use this to determine which orphan 2222 * in. delete_inode will use this to determine which orphan
2224 * dir to lock. */ 2223 * dir to lock. */
2225 spin_lock(&OCFS2_I(inode)->ip_lock); 2224 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
2226 OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
2227 spin_unlock(&OCFS2_I(inode)->ip_lock);
2228 2225
2229 mlog(0, "Inode %llu orphaned in slot %d\n", 2226 mlog(0, "Inode %llu orphaned in slot %d\n",
2230 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 2227 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index db8e77cd35d3..82cc92dcf8a6 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -46,11 +46,6 @@
46#include "endian.h" 46#include "endian.h"
47#include "ocfs2_lockid.h" 47#include "ocfs2_lockid.h"
48 48
49struct ocfs2_extent_map {
50 u32 em_clusters;
51 struct rb_root em_extents;
52};
53
54/* Most user visible OCFS2 inodes will have very few pieces of 49/* Most user visible OCFS2 inodes will have very few pieces of
55 * metadata, but larger files (including bitmaps, etc) must be taken 50 * metadata, but larger files (including bitmaps, etc) must be taken
56 * into account when designing an access scheme. We allow a small 51 * into account when designing an access scheme. We allow a small
@@ -303,6 +298,13 @@ static inline int ocfs2_should_order_data(struct inode *inode)
303 return 1; 298 return 1;
304} 299}
305 300
301static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
302{
303 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
304 return 1;
305 return 0;
306}
307
306/* set / clear functions because cluster events can make these happen 308/* set / clear functions because cluster events can make these happen
307 * in parallel so we want the transitions to be atomic. this also 309 * in parallel so we want the transitions to be atomic. this also
308 * means that any future flags osb_flags must be protected by spinlock 310 * means that any future flags osb_flags must be protected by spinlock
@@ -461,6 +463,49 @@ static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
461 return (unsigned long)((bytes + 511) >> 9); 463 return (unsigned long)((bytes + 511) >> 9);
462} 464}
463 465
466static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb,
467 unsigned long pg_index)
468{
469 u32 clusters = pg_index;
470 unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
471
472 if (unlikely(PAGE_CACHE_SHIFT > cbits))
473 clusters = pg_index << (PAGE_CACHE_SHIFT - cbits);
474 else if (PAGE_CACHE_SHIFT < cbits)
475 clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT);
476
477 return clusters;
478}
479
480/*
481 * Find the 1st page index which covers the given clusters.
482 */
483static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_block *sb,
484 u32 clusters)
485{
486 unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
487 unsigned long index = clusters;
488
489 if (PAGE_CACHE_SHIFT > cbits) {
490 index = clusters >> (PAGE_CACHE_SHIFT - cbits);
491 } else if (PAGE_CACHE_SHIFT < cbits) {
492 index = clusters << (cbits - PAGE_CACHE_SHIFT);
493 }
494
495 return index;
496}
497
498static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
499{
500 unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
501 unsigned int pages_per_cluster = 1;
502
503 if (PAGE_CACHE_SHIFT < cbits)
504 pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT);
505
506 return pages_per_cluster;
507}
508
464#define ocfs2_set_bit ext2_set_bit 509#define ocfs2_set_bit ext2_set_bit
465#define ocfs2_clear_bit ext2_clear_bit 510#define ocfs2_clear_bit ext2_clear_bit
466#define ocfs2_test_bit ext2_test_bit 511#define ocfs2_test_bit ext2_test_bit
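
The three helpers added to ocfs2.h above convert between page cache indexes and cluster offsets by comparing PAGE_CACHE_SHIFT with the per-volume cluster size in bits. A stand-alone sketch of the same shift arithmetic, assuming 4 KB pages (shift 12) and 64 KB clusters (16 bits) purely for illustration:

#include <stdio.h>

int main(void)
{
	/* Assumed example geometry: 4 KB pages, 64 KB clusters. */
	unsigned int page_shift = 12;	/* PAGE_CACHE_SHIFT on most arches */
	unsigned int cbits = 16;	/* s_clustersize_bits */
	unsigned long pg_index = 32;
	unsigned int clusters = pg_index;

	/* Mirrors the logic of ocfs2_page_index_to_clusters() above. */
	if (page_shift > cbits)
		clusters = pg_index << (page_shift - cbits);
	else if (page_shift < cbits)
		clusters = pg_index >> (cbits - page_shift);

	/* 16 pages per 64 KB cluster, so page 32 starts cluster 2. */
	printf("page %lu -> cluster %u (%u pages per cluster)\n",
	       pg_index, clusters, 1u << (cbits - page_shift));
	return 0;
}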
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index e61e218f5e0b..71306479c68f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -86,7 +86,8 @@
86 OCFS2_SB(sb)->s_feature_incompat &= ~(mask) 86 OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
87 87
88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB 88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
89#define OCFS2_FEATURE_INCOMPAT_SUPP OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT 89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
90#define OCFS2_FEATURE_RO_COMPAT_SUPP 0 91#define OCFS2_FEATURE_RO_COMPAT_SUPP 0
91 92
92/* 93/*
@@ -155,6 +156,12 @@
155#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ 156#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */
156 157
157/* 158/*
159 * Extent record flags (e_node.leaf.flags)
160 */
161#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but
162 * unwritten */
163
164/*
158 * ioctl commands 165 * ioctl commands
159 */ 166 */
160#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) 167#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
@@ -282,10 +289,21 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
282/* 289/*
283 * On disk extent record for OCFS2 290 * On disk extent record for OCFS2
284 * It describes a range of clusters on disk. 291 * It describes a range of clusters on disk.
292 *
293 * Length fields are divided into interior and leaf node versions.
294 * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes.
285 */ 295 */
286struct ocfs2_extent_rec { 296struct ocfs2_extent_rec {
287/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ 297/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */
288 __le32 e_clusters; /* Clusters covered by this extent */ 298 union {
299 __le32 e_int_clusters; /* Clusters covered by all children */
300 struct {
301 __le16 e_leaf_clusters; /* Clusters covered by this
302 extent */
303 __u8 e_reserved1;
304 __u8 e_flags; /* Extent flags */
305 };
306 };
289 __le64 e_blkno; /* Physical disk offset, in blocks */ 307 __le64 e_blkno; /* Physical disk offset, in blocks */
290/*10*/ 308/*10*/
291}; 309};
@@ -311,7 +329,10 @@ struct ocfs2_extent_list {
311/*00*/ __le16 l_tree_depth; /* Extent tree depth from this 329/*00*/ __le16 l_tree_depth; /* Extent tree depth from this
312 point. 0 means data extents 330 point. 0 means data extents
313 hang directly off this 331 hang directly off this
314 header (a leaf) */ 332 header (a leaf)
333 NOTE: The high 8 bits cannot be
334 used - tree_depth is never that big.
335 */
315 __le16 l_count; /* Number of extent records */ 336 __le16 l_count; /* Number of extent records */
316 __le16 l_next_free_rec; /* Next unused extent slot */ 337 __le16 l_next_free_rec; /* Next unused extent slot */
317 __le16 l_reserved1; 338 __le16 l_reserved1;
@@ -446,7 +467,9 @@ struct ocfs2_dinode {
446 __le32 i_ctime_nsec; 467 __le32 i_ctime_nsec;
447 __le32 i_mtime_nsec; 468 __le32 i_mtime_nsec;
448 __le32 i_attr; 469 __le32 i_attr;
449 __le32 i_reserved1; 470 __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL
471 was set in i_flags */
472 __le16 i_reserved1;
450/*70*/ __le64 i_reserved2[8]; 473/*70*/ __le64 i_reserved2[8];
451/*B8*/ union { 474/*B8*/ union {
452 __le64 i_pad1; /* Generic way to refer to this 475 __le64 i_pad1; /* Generic way to refer to this
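
With the ocfs2_extent_rec union above, the meaning of the length bytes now depends on the tree depth of the list holding the record: interior nodes use e_int_clusters, leaves use e_leaf_clusters plus e_flags. A hedged user-space mirror of the layout with a hypothetical rec_clusters() helper (endianness conversion omitted; real code must use le32_to_cpu()/le16_to_cpu()):

#include <stdint.h>
#include <stdio.h>

/* User-space mirror of the new on-disk record, for illustration only. */
struct extent_rec {
	uint32_t e_cpos;			/* file offset, in clusters */
	union {
		uint32_t e_int_clusters;	/* interior nodes */
		struct {
			uint16_t e_leaf_clusters;	/* leaves */
			uint8_t  e_reserved1;
			uint8_t  e_flags;	/* OCFS2_EXT_* flags */
		};
	};
	uint64_t e_blkno;			/* physical offset, in blocks */
} __attribute__((packed));

/* Hypothetical helper: pick the right length field by tree depth. */
static uint32_t rec_clusters(uint16_t tree_depth, const struct extent_rec *rec)
{
	return tree_depth ? rec->e_int_clusters : rec->e_leaf_clusters;
}

int main(void)
{
	struct extent_rec rec = { .e_cpos = 0, .e_blkno = 100 };

	rec.e_leaf_clusters = 8;
	/* The union keeps the record at 16 bytes, matching the old layout. */
	printf("record size %zu, leaf clusters %u\n",
	       sizeof(rec), rec_clusters(0, &rec));
	return 0;
}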
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 4d5d5655c185..4ca02b1c38ac 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -44,6 +44,7 @@ enum ocfs2_lock_type {
44 OCFS2_LOCK_TYPE_RENAME, 44 OCFS2_LOCK_TYPE_RENAME,
45 OCFS2_LOCK_TYPE_RW, 45 OCFS2_LOCK_TYPE_RW,
46 OCFS2_LOCK_TYPE_DENTRY, 46 OCFS2_LOCK_TYPE_DENTRY,
47 OCFS2_LOCK_TYPE_OPEN,
47 OCFS2_NUM_LOCK_TYPES 48 OCFS2_NUM_LOCK_TYPES
48}; 49};
49 50
@@ -69,6 +70,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
69 case OCFS2_LOCK_TYPE_DENTRY: 70 case OCFS2_LOCK_TYPE_DENTRY:
70 c = 'N'; 71 c = 'N';
71 break; 72 break;
73 case OCFS2_LOCK_TYPE_OPEN:
74 c = 'O';
75 break;
72 default: 76 default:
73 c = '\0'; 77 c = '\0';
74 } 78 }
@@ -85,6 +89,7 @@ static char *ocfs2_lock_type_strings[] = {
85 * important job it does, anyway. */ 89 * important job it does, anyway. */
86 [OCFS2_LOCK_TYPE_RW] = "Write/Read", 90 [OCFS2_LOCK_TYPE_RW] = "Write/Read",
87 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", 91 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
92 [OCFS2_LOCK_TYPE_OPEN] = "Open",
88}; 93};
89 94
90static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 95static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 2d3ac32cb74e..d921a28329dc 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
197 goto bail; 197 goto bail;
198 } 198 }
199 199
200 status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL); 200 status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL);
201 if (status < 0) { 201 if (status < 0) {
202 mlog_errno(status); 202 mlog_errno(status);
203 goto bail; 203 goto bail;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 6dbb11762759..0da655ae5d6f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -381,8 +381,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
381 le32_to_cpu(fe->i_clusters))); 381 le32_to_cpu(fe->i_clusters)));
382 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); 382 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
383 i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); 383 i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
384 alloc_inode->i_blocks = 384 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
385 ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
386 385
387 status = 0; 386 status = 0;
388bail: 387bail:
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 6534f92424dd..5c9e8243691f 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -806,9 +806,6 @@ static int __init ocfs2_init(void)
806 806
807 ocfs2_print_version(); 807 ocfs2_print_version();
808 808
809 if (init_ocfs2_extent_maps())
810 return -ENOMEM;
811
812 status = init_ocfs2_uptodate_cache(); 809 status = init_ocfs2_uptodate_cache();
813 if (status < 0) { 810 if (status < 0) {
814 mlog_errno(status); 811 mlog_errno(status);
@@ -837,7 +834,6 @@ leave:
837 if (status < 0) { 834 if (status < 0) {
838 ocfs2_free_mem_caches(); 835 ocfs2_free_mem_caches();
839 exit_ocfs2_uptodate_cache(); 836 exit_ocfs2_uptodate_cache();
840 exit_ocfs2_extent_maps();
841 } 837 }
842 838
843 mlog_exit(status); 839 mlog_exit(status);
@@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void)
863 859
864 unregister_filesystem(&ocfs2_fs_type); 860 unregister_filesystem(&ocfs2_fs_type);
865 861
866 exit_ocfs2_extent_maps();
867
868 exit_ocfs2_uptodate_cache(); 862 exit_ocfs2_uptodate_cache();
869 863
870 mlog_exit_void(); 864 mlog_exit_void();
@@ -963,6 +957,7 @@ static void ocfs2_inode_init_once(void *data,
963 ocfs2_lock_res_init_once(&oi->ip_rw_lockres); 957 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
964 ocfs2_lock_res_init_once(&oi->ip_meta_lockres); 958 ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
965 ocfs2_lock_res_init_once(&oi->ip_data_lockres); 959 ocfs2_lock_res_init_once(&oi->ip_data_lockres);
960 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
966 961
967 ocfs2_metadata_cache_init(&oi->vfs_inode); 962 ocfs2_metadata_cache_init(&oi->vfs_inode);
968 963
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index f30e63b9910c..4f82a2f0efef 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -63,17 +63,10 @@ struct ocfs2_msg_hdr
63 __be32 h_node_num; /* node sending this particular message. */ 63 __be32 h_node_num; /* node sending this particular message. */
64}; 64};
65 65
66/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
67 * for the network. */
68#define OCFS2_VOTE_FILENAME_LEN 256
69struct ocfs2_vote_msg 66struct ocfs2_vote_msg
70{ 67{
71 struct ocfs2_msg_hdr v_hdr; 68 struct ocfs2_msg_hdr v_hdr;
72 union { 69 __be32 v_reserved1;
73 __be32 v_generic1;
74 __be32 v_orphaned_slot; /* Used during delete votes */
75 __be32 v_nlink; /* Used during unlink votes */
76 } md1; /* Message type dependant 1 */
77}; 70};
78 71
79/* Responses are given these values to maintain backwards 72/* Responses are given these values to maintain backwards
@@ -86,7 +79,6 @@ struct ocfs2_response_msg
86{ 79{
87 struct ocfs2_msg_hdr r_hdr; 80 struct ocfs2_msg_hdr r_hdr;
88 __be32 r_response; 81 __be32 r_response;
89 __be32 r_orphaned_slot;
90}; 82};
91 83
92struct ocfs2_vote_work { 84struct ocfs2_vote_work {
@@ -96,7 +88,6 @@ struct ocfs2_vote_work {
96 88
97enum ocfs2_vote_request { 89enum ocfs2_vote_request {
98 OCFS2_VOTE_REQ_INVALID = 0, 90 OCFS2_VOTE_REQ_INVALID = 0,
99 OCFS2_VOTE_REQ_DELETE,
100 OCFS2_VOTE_REQ_MOUNT, 91 OCFS2_VOTE_REQ_MOUNT,
101 OCFS2_VOTE_REQ_UMOUNT, 92 OCFS2_VOTE_REQ_UMOUNT,
102 OCFS2_VOTE_REQ_LAST 93 OCFS2_VOTE_REQ_LAST
@@ -151,135 +142,23 @@ static void ocfs2_process_umount_request(struct ocfs2_super *osb,
151 ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num); 142 ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
152} 143}
153 144
154void ocfs2_mark_inode_remotely_deleted(struct inode *inode)
155{
156 struct ocfs2_inode_info *oi = OCFS2_I(inode);
157
158 assert_spin_locked(&oi->ip_lock);
159 /* We set the SKIP_DELETE flag on the inode so we don't try to
160 * delete it in delete_inode ourselves, thus avoiding
161 * unnecessary lock pinging. If the other node failed to wipe
162 * the inode as a result of a crash, then recovery will pick
163 * up the slack. */
164 oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE;
165}
166
167static int ocfs2_process_delete_request(struct inode *inode,
168 int *orphaned_slot)
169{
170 int response = OCFS2_RESPONSE_BUSY;
171
172 mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
173 inode->i_ino, inode->i_nlink, *orphaned_slot);
174
175 spin_lock(&OCFS2_I(inode)->ip_lock);
176
177 /* Whatever our vote response is, we want to make sure that
178 * the orphaned slot is recorded properly on this node *and*
179 * on the requesting node. Technically, if the requesting node
180 * did not know which slot the inode is orphaned in but we
181 * respond with BUSY he doesn't actually need the orphaned
182 * slot, but it doesn't hurt to do it here anyway. */
183 if ((*orphaned_slot) != OCFS2_INVALID_SLOT) {
184 mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot !=
185 OCFS2_INVALID_SLOT &&
186 OCFS2_I(inode)->ip_orphaned_slot !=
187 (*orphaned_slot),
188 "Inode %llu: This node thinks it's "
189 "orphaned in slot %d, messaged it's in %d\n",
190 (unsigned long long)OCFS2_I(inode)->ip_blkno,
191 OCFS2_I(inode)->ip_orphaned_slot,
192 *orphaned_slot);
193
194 mlog(0, "Setting orphaned slot for inode %llu to %d\n",
195 (unsigned long long)OCFS2_I(inode)->ip_blkno,
196 *orphaned_slot);
197
198 OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot;
199 } else {
200 mlog(0, "Sending back orphaned slot %d for inode %llu\n",
201 OCFS2_I(inode)->ip_orphaned_slot,
202 (unsigned long long)OCFS2_I(inode)->ip_blkno);
203
204 *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
205 }
206
207 /* vote no if the file is still open. */
208 if (OCFS2_I(inode)->ip_open_count) {
209 mlog(0, "open count = %u\n",
210 OCFS2_I(inode)->ip_open_count);
211 spin_unlock(&OCFS2_I(inode)->ip_lock);
212 goto done;
213 }
214 spin_unlock(&OCFS2_I(inode)->ip_lock);
215
216 /* directories are a bit ugly... What if someone is sitting in
217 * it? We want to make sure the inode is removed completely as
218 * a result of the iput in process_vote. */
219 if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
220 mlog(0, "i_count = %u\n", atomic_read(&inode->i_count));
221 goto done;
222 }
223
224 if (filemap_fdatawrite(inode->i_mapping)) {
225 mlog(ML_ERROR, "Could not sync inode %llu for delete!\n",
226 (unsigned long long)OCFS2_I(inode)->ip_blkno);
227 goto done;
228 }
229 sync_mapping_buffers(inode->i_mapping);
230 truncate_inode_pages(inode->i_mapping, 0);
231 ocfs2_extent_map_trunc(inode, 0);
232
233 spin_lock(&OCFS2_I(inode)->ip_lock);
234 /* double check open count - someone might have raced this
235 * thread into ocfs2_file_open while we were writing out
236 * data. If we're to allow a wipe of this inode now, we *must*
237 * hold the spinlock until we've marked it. */
238 if (OCFS2_I(inode)->ip_open_count) {
239 mlog(0, "Raced to wipe! open count = %u\n",
240 OCFS2_I(inode)->ip_open_count);
241 spin_unlock(&OCFS2_I(inode)->ip_lock);
242 goto done;
243 }
244
245 /* Mark the inode as being wiped from disk. */
246 ocfs2_mark_inode_remotely_deleted(inode);
247 spin_unlock(&OCFS2_I(inode)->ip_lock);
248
249 /* Not sure this is necessary anymore. */
250 d_prune_aliases(inode);
251
252 /* If we get here, then we're voting 'yes', so commit the
253 * delete on our side. */
254 response = OCFS2_RESPONSE_OK;
255done:
256 return response;
257}
258
259static void ocfs2_process_vote(struct ocfs2_super *osb, 145static void ocfs2_process_vote(struct ocfs2_super *osb,
260 struct ocfs2_vote_msg *msg) 146 struct ocfs2_vote_msg *msg)
261{ 147{
262 int net_status, vote_response; 148 int net_status, vote_response;
263 int orphaned_slot = 0; 149 unsigned int node_num;
264 unsigned int node_num, generation;
265 u64 blkno; 150 u64 blkno;
266 enum ocfs2_vote_request request; 151 enum ocfs2_vote_request request;
267 struct inode *inode = NULL;
268 struct ocfs2_msg_hdr *hdr = &msg->v_hdr; 152 struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
269 struct ocfs2_response_msg response; 153 struct ocfs2_response_msg response;
270 154
271 /* decode the network mumbo jumbo into local variables. */ 155 /* decode the network mumbo jumbo into local variables. */
272 request = be32_to_cpu(hdr->h_request); 156 request = be32_to_cpu(hdr->h_request);
273 blkno = be64_to_cpu(hdr->h_blkno); 157 blkno = be64_to_cpu(hdr->h_blkno);
274 generation = be32_to_cpu(hdr->h_generation);
275 node_num = be32_to_cpu(hdr->h_node_num); 158 node_num = be32_to_cpu(hdr->h_node_num);
276 if (request == OCFS2_VOTE_REQ_DELETE)
277 orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot);
278 159
279 mlog(0, "processing vote: request = %u, blkno = %llu, " 160 mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
280 "generation = %u, node_num = %u, priv1 = %u\n", request, 161 request, (unsigned long long)blkno, node_num);
281 (unsigned long long)blkno, generation, node_num,
282 be32_to_cpu(msg->md1.v_generic1));
283 162
284 if (!ocfs2_is_valid_vote_request(request)) { 163 if (!ocfs2_is_valid_vote_request(request)) {
285 mlog(ML_ERROR, "Invalid vote request %d from node %u\n", 164 mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
@@ -302,52 +181,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb,
302 break; 181 break;
303 } 182 }
304 183
305 /* We cannot process the remaining message types before we're
306 * fully mounted. It's perfectly safe however to send a 'yes'
307 * response as we can't possibly have any of the state they're
308 * asking us to modify yet. */
309 if (atomic_read(&osb->vol_state) == VOLUME_INIT)
310 goto respond;
311
312 /* If we get here, then the request is against an inode. */
313 inode = ocfs2_ilookup_for_vote(osb, blkno,
314 request == OCFS2_VOTE_REQ_DELETE);
315
316 /* Not finding the inode is perfectly valid - it means we're
317 * not interested in what the other node is about to do to it
318 * so in those cases we automatically respond with an
319 * affirmative. Cluster locking ensures that we won't race
320 * interest in the inode with this vote request. */
321 if (!inode)
322 goto respond;
323
324 /* Check generation values. It's possible for us to get a
325 * request against a stale inode. If so then we proceed as if
326 * we had not found an inode in the first place. */
327 if (inode->i_generation != generation) {
328 mlog(0, "generation passed %u != inode generation = %u, "
329 "ip_flags = %x, ip_blkno = %llu, msg %llu, i_count = %u, "
330 "message type = %u\n", generation, inode->i_generation,
331 OCFS2_I(inode)->ip_flags,
332 (unsigned long long)OCFS2_I(inode)->ip_blkno,
333 (unsigned long long)blkno, atomic_read(&inode->i_count),
334 request);
335 iput(inode);
336 inode = NULL;
337 goto respond;
338 }
339
340 switch (request) {
341 case OCFS2_VOTE_REQ_DELETE:
342 vote_response = ocfs2_process_delete_request(inode,
343 &orphaned_slot);
344 break;
345 default:
346 mlog(ML_ERROR, "node %u, invalid request: %u\n",
347 node_num, request);
348 vote_response = OCFS2_RESPONSE_BAD_MSG;
349 }
350
351respond: 184respond:
352 /* Response structure is small so we just put it on the stack 185
353 * and stuff it inline. */ 186 * and stuff it inline. */
@@ -357,7 +190,6 @@ respond:
357 response.r_hdr.h_generation = hdr->h_generation; 190 response.r_hdr.h_generation = hdr->h_generation;
358 response.r_hdr.h_node_num = cpu_to_be32(osb->node_num); 191 response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
359 response.r_response = cpu_to_be32(vote_response); 192 response.r_response = cpu_to_be32(vote_response);
360 response.r_orphaned_slot = cpu_to_be32(orphaned_slot);
361 193
362 net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE, 194 net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
363 osb->net_key, 195 osb->net_key,
@@ -373,9 +205,6 @@ respond:
373 && net_status != -ENOTCONN) 205 && net_status != -ENOTCONN)
374 mlog(ML_ERROR, "message to node %u fails with error %d!\n", 206 mlog(ML_ERROR, "message to node %u fails with error %d!\n",
375 node_num, net_status); 207 node_num, net_status);
376
377 if (inode)
378 iput(inode);
379} 208}
380 209
381static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb) 210static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
@@ -634,8 +463,7 @@ bail:
634static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, 463static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
635 u64 blkno, 464 u64 blkno,
636 unsigned int generation, 465 unsigned int generation,
637 enum ocfs2_vote_request type, 466 enum ocfs2_vote_request type)
638 u32 priv)
639{ 467{
640 struct ocfs2_vote_msg *request; 468 struct ocfs2_vote_msg *request;
641 struct ocfs2_msg_hdr *hdr; 469 struct ocfs2_msg_hdr *hdr;
@@ -651,8 +479,6 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
651 hdr->h_request = cpu_to_be32(type); 479 hdr->h_request = cpu_to_be32(type);
652 hdr->h_blkno = cpu_to_be64(blkno); 480 hdr->h_blkno = cpu_to_be64(blkno);
653 hdr->h_generation = cpu_to_be32(generation); 481 hdr->h_generation = cpu_to_be32(generation);
654
655 request->md1.v_generic1 = cpu_to_be32(priv);
656 } 482 }
657 483
658 return request; 484 return request;
@@ -664,7 +490,7 @@ static int ocfs2_do_request_vote(struct ocfs2_super *osb,
664 struct ocfs2_vote_msg *request, 490 struct ocfs2_vote_msg *request,
665 struct ocfs2_net_response_cb *callback) 491 struct ocfs2_net_response_cb *callback)
666{ 492{
667 int status, response; 493 int status, response = -EBUSY;
668 unsigned int response_id; 494 unsigned int response_id;
669 struct ocfs2_msg_hdr *hdr; 495 struct ocfs2_msg_hdr *hdr;
670 496
@@ -686,109 +512,12 @@ bail:
686 return status; 512 return status;
687} 513}
688 514
689static int ocfs2_request_vote(struct inode *inode,
690 struct ocfs2_vote_msg *request,
691 struct ocfs2_net_response_cb *callback)
692{
693 int status;
694 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
695
696 if (ocfs2_inode_is_new(inode))
697 return 0;
698
699 status = -EAGAIN;
700 while (status == -EAGAIN) {
701 if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
702 signal_pending(current))
703 return -ERESTARTSYS;
704
705 status = ocfs2_super_lock(osb, 0);
706 if (status < 0) {
707 mlog_errno(status);
708 break;
709 }
710
711 status = 0;
712 if (!ocfs2_node_map_is_only(osb, &osb->mounted_map,
713 osb->node_num))
714 status = ocfs2_do_request_vote(osb, request, callback);
715
716 ocfs2_super_unlock(osb, 0);
717 }
718 return status;
719}
720
721static void ocfs2_delete_response_cb(void *priv,
722 struct ocfs2_response_msg *resp)
723{
724 int orphaned_slot, node;
725 struct inode *inode = priv;
726
727 orphaned_slot = be32_to_cpu(resp->r_orphaned_slot);
728 node = be32_to_cpu(resp->r_hdr.h_node_num);
729 mlog(0, "node %d tells us that inode %llu is orphaned in slot %d\n",
730 node, (unsigned long long)OCFS2_I(inode)->ip_blkno,
731 orphaned_slot);
732
733 /* The other node may not actually know which slot the inode
734 * is orphaned in. */
735 if (orphaned_slot == OCFS2_INVALID_SLOT)
736 return;
737
738 /* Ok, the responding node knows which slot this inode is
739 * orphaned in. We verify that the information is correct and
740 * then record this in the inode. ocfs2_delete_inode will use
741 * this information to determine which lock to take. */
742 spin_lock(&OCFS2_I(inode)->ip_lock);
743 mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot &&
744 OCFS2_I(inode)->ip_orphaned_slot
745 != OCFS2_INVALID_SLOT, "Inode %llu: Node %d says it's "
746 "orphaned in slot %d, we think it's in %d\n",
747 (unsigned long long)OCFS2_I(inode)->ip_blkno,
748 be32_to_cpu(resp->r_hdr.h_node_num),
749 orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot);
750
751 OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot;
752 spin_unlock(&OCFS2_I(inode)->ip_lock);
753}
754
755int ocfs2_request_delete_vote(struct inode *inode)
756{
757 int orphaned_slot, status;
758 struct ocfs2_net_response_cb delete_cb;
759 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
760 struct ocfs2_vote_msg *request;
761
762 spin_lock(&OCFS2_I(inode)->ip_lock);
763 orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
764 spin_unlock(&OCFS2_I(inode)->ip_lock);
765
766 delete_cb.rc_cb = ocfs2_delete_response_cb;
767 delete_cb.rc_priv = inode;
768
769 mlog(0, "Inode %llu, we start thinking orphaned slot is %d\n",
770 (unsigned long long)OCFS2_I(inode)->ip_blkno, orphaned_slot);
771
772 status = -ENOMEM;
773 request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
774 inode->i_generation,
775 OCFS2_VOTE_REQ_DELETE, orphaned_slot);
776 if (request) {
777 status = ocfs2_request_vote(inode, request, &delete_cb);
778
779 kfree(request);
780 }
781
782 return status;
783}
784
785int ocfs2_request_mount_vote(struct ocfs2_super *osb) 515int ocfs2_request_mount_vote(struct ocfs2_super *osb)
786{ 516{
787 int status; 517 int status;
788 struct ocfs2_vote_msg *request = NULL; 518 struct ocfs2_vote_msg *request = NULL;
789 519
790 request = ocfs2_new_vote_request(osb, 0ULL, 0, 520 request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
791 OCFS2_VOTE_REQ_MOUNT, 0);
792 if (!request) { 521 if (!request) {
793 status = -ENOMEM; 522 status = -ENOMEM;
794 goto bail; 523 goto bail;
@@ -821,8 +550,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb)
821 int status; 550 int status;
822 struct ocfs2_vote_msg *request = NULL; 551 struct ocfs2_vote_msg *request = NULL;
823 552
824 request = ocfs2_new_vote_request(osb, 0ULL, 0, 553 request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
825 OCFS2_VOTE_REQ_UMOUNT, 0);
826 if (!request) { 554 if (!request) {
827 status = -ENOMEM; 555 status = -ENOMEM;
828 goto bail; 556 goto bail;
@@ -969,7 +697,6 @@ static int ocfs2_handle_vote_message(struct o2net_msg *msg,
969 be32_to_cpu(work->w_msg.v_hdr.h_generation)); 697 be32_to_cpu(work->w_msg.v_hdr.h_generation));
970 mlog(0, "h_node_num = %u\n", 698 mlog(0, "h_node_num = %u\n",
971 be32_to_cpu(work->w_msg.v_hdr.h_node_num)); 699 be32_to_cpu(work->w_msg.v_hdr.h_node_num));
972 mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1));
973 700
974 spin_lock(&osb->vote_task_lock); 701 spin_lock(&osb->vote_task_lock);
975 list_add_tail(&work->w_list, &osb->vote_list); 702 list_add_tail(&work->w_list, &osb->vote_list);
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h
index 53ebc1c69e56..9ea46f62de31 100644
--- a/fs/ocfs2/vote.h
+++ b/fs/ocfs2/vote.h
@@ -38,14 +38,11 @@ static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
38 wake_up(&osb->vote_event); 38 wake_up(&osb->vote_event);
39} 39}
40 40
41int ocfs2_request_delete_vote(struct inode *inode);
42int ocfs2_request_mount_vote(struct ocfs2_super *osb); 41int ocfs2_request_mount_vote(struct ocfs2_super *osb);
43int ocfs2_request_umount_vote(struct ocfs2_super *osb); 42int ocfs2_request_umount_vote(struct ocfs2_super *osb);
44int ocfs2_register_net_handlers(struct ocfs2_super *osb); 43int ocfs2_register_net_handlers(struct ocfs2_super *osb);
45void ocfs2_unregister_net_handlers(struct ocfs2_super *osb); 44void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
46 45
47void ocfs2_mark_inode_remotely_deleted(struct inode *inode);
48
49void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, 46void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
50 int node_num); 47 int node_num);
51#endif 48#endif
diff --git a/fs/sync.c b/fs/sync.c
index d0feff61e6aa..5cb9e7e43383 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -239,13 +239,11 @@ out:
239/* 239/*
240 * `endbyte' is inclusive 240 * `endbyte' is inclusive
241 */ 241 */
242int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, 242int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
243 unsigned int flags) 243 loff_t endbyte, unsigned int flags)
244{ 244{
245 int ret; 245 int ret;
246 struct address_space *mapping;
247 246
248 mapping = file->f_mapping;
249 if (!mapping) { 247 if (!mapping) {
250 ret = -EINVAL; 248 ret = -EINVAL;
251 goto out; 249 goto out;
@@ -275,4 +273,4 @@ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
275out: 273out:
276 return ret; 274 return ret;
277} 275}
278EXPORT_SYMBOL_GPL(do_sync_file_range); 276EXPORT_SYMBOL_GPL(do_sync_mapping_range);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 86ec3f4a7da6..095a9c9a64fb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -843,8 +843,13 @@ extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
843extern int fcntl_getlease(struct file *filp); 843extern int fcntl_getlease(struct file *filp);
844 844
845/* fs/sync.c */ 845/* fs/sync.c */
846extern int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, 846extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
847 unsigned int flags); 847 loff_t endbyte, unsigned int flags);
848static inline int do_sync_file_range(struct file *file, loff_t offset,
849 loff_t endbyte, unsigned int flags)
850{
851 return do_sync_mapping_range(file->f_mapping, offset, endbyte, flags);
852}
848 853
849/* fs/locks.c */ 854/* fs/locks.c */
850extern void locks_init_lock(struct file_lock *); 855extern void locks_init_lock(struct file_lock *);
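
do_sync_mapping_range() lets code that holds only an address_space (as ocfs2 now does) start and wait on writeback without a struct file; the old do_sync_file_range() becomes the inline wrapper above. A minimal kernel-side sketch, assuming the existing SYNC_FILE_RANGE_* flags; flush_first_page() is an invented name:

#include <linux/fs.h>

/* Write out and wait on the first 4 KB of an inode's pages ('endbyte'
 * is inclusive, hence 4095). A sketch, not part of the patch itself. */
static int flush_first_page(struct inode *inode)
{
	return do_sync_mapping_range(inode->i_mapping, 0, 4095,
				     SYNC_FILE_RANGE_WAIT_BEFORE |
				     SYNC_FILE_RANGE_WRITE |
				     SYNC_FILE_RANGE_WAIT_AFTER);
}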
diff --git a/include/linux/mtd/ubi.h b/include/linux/mtd/ubi.h
new file mode 100644
index 000000000000..3d967b6b120a
--- /dev/null
+++ b/include/linux/mtd/ubi.h
@@ -0,0 +1,202 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21#ifndef __LINUX_UBI_H__
22#define __LINUX_UBI_H__
23
24#include <asm/ioctl.h>
25#include <linux/types.h>
26#include <mtd/ubi-user.h>
27
28/*
29 * UBI data type hint constants.
30 *
31 * UBI_LONGTERM: long-term data
32 * UBI_SHORTTERM: short-term data
33 * UBI_UNKNOWN: data persistence is unknown
34 *
35 * These constants are used when data is written to UBI volumes in order to
36 * help the UBI wear-leveling unit to find more appropriate physical
37 * eraseblocks.
38 */
39enum {
40 UBI_LONGTERM = 1,
41 UBI_SHORTTERM,
42 UBI_UNKNOWN
43};
44
45/*
46 * enum ubi_open_mode - UBI volume open mode constants.
47 *
48 * UBI_READONLY: read-only mode
49 * UBI_READWRITE: read-write mode
50 * UBI_EXCLUSIVE: exclusive mode
51 */
52enum {
53 UBI_READONLY = 1,
54 UBI_READWRITE,
55 UBI_EXCLUSIVE
56};
57
58/**
59 * struct ubi_volume_info - UBI volume description data structure.
60 * @vol_id: volume ID
61 * @ubi_num: UBI device number this volume belongs to
62 * @size: how many physical eraseblocks are reserved for this volume
63 * @used_bytes: how many bytes of data this volume contains
64 * @used_ebs: how many physical eraseblocks of this volume actually contain any
65 * data
66 * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME)
67 * @corrupted: non-zero if the volume is corrupted (static volumes only)
68 * @upd_marker: non-zero if the volume has update marker set
69 * @alignment: volume alignment
70 * @usable_leb_size: how many bytes are available in logical eraseblocks of
71 * this volume
72 * @name_len: volume name length
73 * @name: volume name
74 * @cdev: UBI volume character device major and minor numbers
75 *
76 * The @corrupted flag is only relevant to static volumes and is always zero
77 * for dynamic ones. This is because UBI does not care about dynamic volume
78 * data protection and only cares about protecting static volume data.
79 *
80 * The @upd_marker flag is set if the volume update operation was interrupted.
81 * Before touching the volume data during the update operation, UBI first sets
82 * the update marker flag for this volume. If the update operation is then
83 * interrupted, the update marker indicates this. If the update marker
84 * is set, the contents of the volume are certainly damaged and a new volume
85 * update operation has to be started.
86 *
87 * To put it differently, @corrupted and @upd_marker fields have different
88 * semantics:
89 * o the @corrupted flag means that this static volume is corrupted for some
90 * reason, but not because of an interrupted volume update
91 * o the @upd_marker field means that the volume is damaged because of an
92 * interrupted update operation.
93 *
94 * I.e., the @corrupted flag is never set if the @upd_marker flag is set.
95 *
96 * The @used_bytes and @used_ebs fields are only really needed for static
97 * volumes and contain the number of bytes stored in this static volume and how
98 * many eraseblocks this data occupies. In case of dynamic volumes, the
99 * @used_bytes field is equivalent to @size*@usable_leb_size, and the @used_ebs
100 * field is equivalent to @size.
101 *
102 * In general, logical eraseblock size is a property of the UBI device, not
103 * of the UBI volume. Indeed, the logical eraseblock size depends on the
104 * physical eraseblock size and on how many bytes UBI headers consume. But
105 * because of the volume alignment (@alignment), the usable size of logical
106 * eraseblocks of a volume may be less. The following equation is true:
107 * @usable_leb_size = LEB size - (LEB size mod @alignment),
108 * where LEB size is the logical eraseblock size defined by the UBI device.
109 *
110 * The alignment is a multiple of the minimal flash input/output unit size,
111 * or %1 if all the available space is used.
112 *
113 * To put this differently, alignment may be considered a way to change
114 * volume logical eraseblock sizes.
115 */
116struct ubi_volume_info {
117 int ubi_num;
118 int vol_id;
119 int size;
120 long long used_bytes;
121 int used_ebs;
122 int vol_type;
123 int corrupted;
124 int upd_marker;
125 int alignment;
126 int usable_leb_size;
127 int name_len;
128 const char *name;
129 dev_t cdev;
130};
131
132/**
133 * struct ubi_device_info - UBI device description data structure.
134 * @ubi_num: ubi device number
135 * @leb_size: logical eraseblock size on this UBI device
136 * @min_io_size: minimal I/O unit size
137 * @ro_mode: if this device is in read-only mode
138 * @cdev: UBI character device major and minor numbers
139 *
140 * Note, @leb_size is the logical eraseblock size offered by the UBI device.
141 * Volumes of this UBI device may have smaller logical eraseblock size if their
142 * alignment is not equivalent to %1.
143 */
144struct ubi_device_info {
145 int ubi_num;
146 int leb_size;
147 int min_io_size;
148 int ro_mode;
149 dev_t cdev;
150};
151
152/* UBI descriptor given to users when they open UBI volumes */
153struct ubi_volume_desc;
154
155int ubi_get_device_info(int ubi_num, struct ubi_device_info *di);
156void ubi_get_volume_info(struct ubi_volume_desc *desc,
157 struct ubi_volume_info *vi);
158struct ubi_volume_desc *ubi_open_volume(int ubi_num, int vol_id, int mode);
159struct ubi_volume_desc *ubi_open_volume_nm(int ubi_num, const char *name,
160 int mode);
161void ubi_close_volume(struct ubi_volume_desc *desc);
162int ubi_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
163 int len, int check);
164int ubi_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
165 int offset, int len, int dtype);
166int ubi_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
167 int len, int dtype);
168int ubi_leb_erase(struct ubi_volume_desc *desc, int lnum);
169int ubi_leb_unmap(struct ubi_volume_desc *desc, int lnum);
170int ubi_is_mapped(struct ubi_volume_desc *desc, int lnum);
171
172/*
173 * This function is the same as the 'ubi_leb_read()' function, but it does not
174 * provide the checking capability.
175 */
176static inline int ubi_read(struct ubi_volume_desc *desc, int lnum, char *buf,
177 int offset, int len)
178{
179 return ubi_leb_read(desc, lnum, buf, offset, len, 0);
180}
181
182/*
183 * This function is the same as the 'ubi_leb_write()' function, but it does
184 * not have the data type argument.
185 */
186static inline int ubi_write(struct ubi_volume_desc *desc, int lnum,
187 const void *buf, int offset, int len)
188{
189 return ubi_leb_write(desc, lnum, buf, offset, len, UBI_UNKNOWN);
190}
191
192/*
193 * This function is the same as the 'ubi_leb_change()' function, but it does
194 * not have the data type argument.
195 */
196static inline int ubi_change(struct ubi_volume_desc *desc, int lnum,
197 const void *buf, int len)
198{
199 return ubi_leb_change(desc, lnum, buf, len, UBI_UNKNOWN);
200}
201
202#endif /* !__LINUX_UBI_H__ */
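
Taken together, the declarations above form the whole in-kernel UBI client API: open a volume by number or name, operate on logical eraseblocks, close it. A hedged sketch of a client module writing one LEB, assuming UBI device 0 and volume 0 already exist (everything except the ubi_* calls is illustrative):

#include <linux/err.h>
#include <linux/mtd/ubi.h>

static int ubi_hello_write(void)
{
	static const char buf[] = "hello";
	struct ubi_volume_desc *desc;
	int err;

	/* ubi_open_volume() is assumed to return ERR_PTR() on failure. */
	desc = ubi_open_volume(0, 0, UBI_READWRITE);
	if (IS_ERR(desc))
		return PTR_ERR(desc);

	/* Replace the contents of LEB 0; the change is intended to be
	 * atomic with respect to unclean reboots. */
	err = ubi_leb_change(desc, 0, buf, sizeof(buf), UBI_UNKNOWN);

	ubi_close_volume(desc);
	return err;
}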
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 49fe2997a016..a1707583de49 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -196,13 +196,13 @@ extern void init_idle(struct task_struct *idle, int cpu);
196extern cpumask_t nohz_cpu_mask; 196extern cpumask_t nohz_cpu_mask;
197 197
198/* 198/*
199 * Only dump TASK_* tasks. (-1 for all tasks) 199 * Only dump TASK_* tasks. (0 for all tasks)
200 */ 200 */
201extern void show_state_filter(unsigned long state_filter); 201extern void show_state_filter(unsigned long state_filter);
202 202
203static inline void show_state(void) 203static inline void show_state(void)
204{ 204{
205 show_state_filter(-1); 205 show_state_filter(0);
206} 206}
207 207
208extern void show_regs(struct pt_regs *); 208extern void show_regs(struct pt_regs *);
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 52c9eb9b6df2..26e4925bc35b 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -61,10 +61,10 @@ static inline void write_seqlock(seqlock_t *sl)
61{ 61{
62 spin_lock(&sl->lock); 62 spin_lock(&sl->lock);
63 ++sl->sequence; 63 ++sl->sequence;
64 smp_wmb(); 64 smp_wmb();
65} 65}
66 66
67static inline void write_sequnlock(seqlock_t *sl) 67static inline void write_sequnlock(seqlock_t *sl)
68{ 68{
69 smp_wmb(); 69 smp_wmb();
70 sl->sequence++; 70 sl->sequence++;
@@ -77,7 +77,7 @@ static inline int write_tryseqlock(seqlock_t *sl)
77 77
78 if (ret) { 78 if (ret) {
79 ++sl->sequence; 79 ++sl->sequence;
80 smp_wmb(); 80 smp_wmb();
81 } 81 }
82 return ret; 82 return ret;
83} 83}
diff --git a/include/mtd/Kbuild b/include/mtd/Kbuild
index e0fe92b03a4e..4d46b3bdebd8 100644
--- a/include/mtd/Kbuild
+++ b/include/mtd/Kbuild
@@ -3,3 +3,5 @@ header-y += jffs2-user.h
3header-y += mtd-abi.h 3header-y += mtd-abi.h
4header-y += mtd-user.h 4header-y += mtd-user.h
5header-y += nftl-user.h 5header-y += nftl-user.h
6header-y += ubi-header.h
7header-y += ubi-user.h
diff --git a/include/mtd/mtd-abi.h b/include/mtd/mtd-abi.h
index 8e501a75a764..f71dac420394 100644
--- a/include/mtd/mtd-abi.h
+++ b/include/mtd/mtd-abi.h
@@ -24,6 +24,7 @@ struct mtd_oob_buf {
24#define MTD_NORFLASH 3 24#define MTD_NORFLASH 3
25#define MTD_NANDFLASH 4 25#define MTD_NANDFLASH 4
26#define MTD_DATAFLASH 6 26#define MTD_DATAFLASH 6
27#define MTD_UBIVOLUME 7
27 28
28#define MTD_WRITEABLE 0x400 /* Device is writeable */ 29#define MTD_WRITEABLE 0x400 /* Device is writeable */
29#define MTD_BIT_WRITEABLE 0x800 /* Single bits can be flipped */ 30#define MTD_BIT_WRITEABLE 0x800 /* Single bits can be flipped */
diff --git a/include/mtd/ubi-header.h b/include/mtd/ubi-header.h
new file mode 100644
index 000000000000..fa479c71aa34
--- /dev/null
+++ b/include/mtd/ubi-header.h
@@ -0,0 +1,360 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Authors: Artem Bityutskiy (Битюцкий Артём)
19 * Thomas Gleixner
20 * Frank Haverkamp
21 * Oliver Lohmann
22 * Andreas Arnez
23 */
24
25/*
26 * This file defines the layout of UBI headers and all the other UBI on-flash
27 * data structures. May be included by user-space.
28 */
29
30#ifndef __UBI_HEADER_H__
31#define __UBI_HEADER_H__
32
33#include <asm/byteorder.h>
34
35/* The version of UBI images supported by this implementation */
36#define UBI_VERSION 1
37
38/* The highest erase counter value supported by this implementation */
39#define UBI_MAX_ERASECOUNTER 0x7FFFFFFF
40
41/* The initial CRC32 value used when calculating CRC checksums */
42#define UBI_CRC32_INIT 0xFFFFFFFFU
43
44/* Erase counter header magic number (ASCII "UBI#") */
45#define UBI_EC_HDR_MAGIC 0x55424923
46/* Volume identifier header magic number (ASCII "UBI!") */
47#define UBI_VID_HDR_MAGIC 0x55424921
48
49/*
50 * Volume type constants used in the volume identifier header.
51 *
52 * @UBI_VID_DYNAMIC: dynamic volume
53 * @UBI_VID_STATIC: static volume
54 */
55enum {
56 UBI_VID_DYNAMIC = 1,
57 UBI_VID_STATIC = 2
58};
59
60/*
61 * Compatibility constants used by internal volumes.
62 *
63 * @UBI_COMPAT_DELETE: delete this internal volume before anything is written
64 * to the flash
65 * @UBI_COMPAT_RO: attach this device in read-only mode
66 * @UBI_COMPAT_PRESERVE: preserve this internal volume - do not touch its
67 * physical eraseblocks, don't allow the wear-leveling unit to move them
68 * @UBI_COMPAT_REJECT: reject this UBI image
69 */
70enum {
71 UBI_COMPAT_DELETE = 1,
72 UBI_COMPAT_RO = 2,
73 UBI_COMPAT_PRESERVE = 4,
74 UBI_COMPAT_REJECT = 5
75};
76
77/*
78 * ubi16_t/ubi32_t/ubi64_t - 16, 32, and 64-bit integers used in UBI on-flash
79 * data structures.
80 */
81typedef struct {
82 uint16_t int16;
83} __attribute__ ((packed)) ubi16_t;
84
85typedef struct {
86 uint32_t int32;
87} __attribute__ ((packed)) ubi32_t;
88
89typedef struct {
90 uint64_t int64;
91} __attribute__ ((packed)) ubi64_t;
92
93/*
94 * This implementation of UBI uses the big-endian format for on-flash
95 * integers. Below are the corresponding conversion macros.
96 */
97#define cpu_to_ubi16(x) ((ubi16_t){__cpu_to_be16(x)})
98#define ubi16_to_cpu(x) ((uint16_t)__be16_to_cpu((x).int16))
99
100#define cpu_to_ubi32(x) ((ubi32_t){__cpu_to_be32(x)})
101#define ubi32_to_cpu(x) ((uint32_t)__be32_to_cpu((x).int32))
102
103#define cpu_to_ubi64(x) ((ubi64_t){__cpu_to_be64(x)})
104#define ubi64_to_cpu(x) ((uint64_t)__be64_to_cpu((x).int64))
105
106/* Sizes of UBI headers */
107#define UBI_EC_HDR_SIZE sizeof(struct ubi_ec_hdr)
108#define UBI_VID_HDR_SIZE sizeof(struct ubi_vid_hdr)
109
110/* Sizes of UBI headers without the ending CRC */
111#define UBI_EC_HDR_SIZE_CRC (UBI_EC_HDR_SIZE - sizeof(ubi32_t))
112#define UBI_VID_HDR_SIZE_CRC (UBI_VID_HDR_SIZE - sizeof(ubi32_t))
113
114/**
115 * struct ubi_ec_hdr - UBI erase counter header.
116 * @magic: erase counter header magic number (%UBI_EC_HDR_MAGIC)
117 * @version: version of UBI implementation which is supposed to accept this
118 * UBI image
119 * @padding1: reserved for future, zeroes
120 * @ec: the erase counter
121 * @vid_hdr_offset: where the VID header starts
122 * @data_offset: where the user data start
123 * @padding2: reserved for future, zeroes
124 * @hdr_crc: erase counter header CRC checksum
125 *
126 * The erase counter header takes 64 bytes and has plenty of unused space for
127 * future usage. The unused fields are zeroed. The @version field is used to
128 * indicate the version of the UBI implementation which is supposed to be able
129 * to work with this UBI image. If @version is greater than the current UBI
130 * version, the image is rejected. This may be useful in future if something
131 * is changed radically. This field is duplicated in the volume identifier
132 * header.
133 *
134 * The @vid_hdr_offset and @data_offset fields contain the offset of the
135 * volume identifier header and user data, relative to the beginning of the
136 * physical eraseblock. These values have to be the same for all physical
137 * eraseblocks.
138 */
139struct ubi_ec_hdr {
140 ubi32_t magic;
141 uint8_t version;
142 uint8_t padding1[3];
143 ubi64_t ec; /* Warning: the current limit is 31-bit anyway! */
144 ubi32_t vid_hdr_offset;
145 ubi32_t data_offset;
146 uint8_t padding2[36];
147 ubi32_t hdr_crc;
148} __attribute__ ((packed));
149
150/**
151 * struct ubi_vid_hdr - on-flash UBI volume identifier header.
152 * @magic: volume identifier header magic number (%UBI_VID_HDR_MAGIC)
153 * @version: UBI implementation version which is supposed to accept this UBI
154 * image (%UBI_VERSION)
155 * @vol_type: volume type (%UBI_VID_DYNAMIC or %UBI_VID_STATIC)
156 * @copy_flag: if this logical eraseblock was copied from another physical
157 * eraseblock (for wear-leveling reasons)
158 * @compat: compatibility of this volume (%0, %UBI_COMPAT_DELETE,
159 * %UBI_COMPAT_RO, %UBI_COMPAT_PRESERVE, or %UBI_COMPAT_REJECT)
160 * @vol_id: ID of this volume
161 * @lnum: logical eraseblock number
162 * @leb_ver: version of this logical eraseblock (IMPORTANT: obsolete, to be
163 * removed, kept only to avoid breaking older UBI users)
164 * @data_size: how many bytes of data this logical eraseblock contains
165 * @used_ebs: total number of used logical eraseblocks in this volume
166 * @data_pad: how many bytes at the end of this physical eraseblock are not
167 * used
168 * @data_crc: CRC checksum of the data stored in this logical eraseblock
169 * @padding1: reserved for future, zeroes
170 * @sqnum: sequence number
171 * @padding2: reserved for future, zeroes
172 * @hdr_crc: volume identifier header CRC checksum
173 *
174 * The @sqnum is the value of the global sequence counter at the time when this
175 * VID header was created. The global sequence counter is incremented each time
176 * UBI writes a new VID header to the flash, i.e. when it maps a logical
177 * eraseblock to a new physical eraseblock. The global sequence counter is an
178 * unsigned 64-bit integer and we assume it never overflows. The @sqnum
179 * (sequence number) is used to distinguish between older and newer versions of
180 * logical eraseblocks.
181 *
182 * There are 2 situations when there may be more than one physical eraseblock
183 * corresponding to the same logical eraseblock, i.e., having the same @vol_id
184 * and @lnum values in the volume identifier header. Suppose we have a logical
185 * eraseblock L and it is mapped to the physical eraseblock P.
186 *
187 * 1. Because UBI may erase physical eraseblocks asynchronously, the following
188 * situation is possible: L is asynchronously erased, so P is scheduled for
189 * erasure, then L is written to, i.e., mapped to another physical eraseblock P1,
190 * so P1 is written to, then an unclean reboot happens. Result - there are 2
191 * physical eraseblocks P and P1 corresponding to the same logical eraseblock
192 * L. But P1 has a greater sequence number, so UBI picks P1 when it attaches the
193 * flash.
194 *
195 * 2. From time to time UBI moves logical eraseblocks to other physical
196 * eraseblocks for wear-leveling reasons. If, for example, UBI moves L from P
197 * to P1, and an unclean reboot happens before P is physically erased, there
198 * are two physical eraseblocks P and P1 corresponding to L and UBI has to
199 * select one of them when the flash is attached. The @sqnum field says which
200 * PEB is the original (obviously P will have lower @sqnum) and the copy. But
201 * it is not enough to select the physical eraseblock with the higher sequence
202 * number, because the unclean reboot could have happen in the middle of the
203 * copying process, so the data in P is corrupted. It is also not enough to
204 * just select the physical eraseblock with lower sequence number, because the
205 * data there may be old (consider a case if more data was added to P1 after
206 * the copying). Moreover, the unclean reboot may happen when the erasure of P
207 * was just started, so it result in unstable P, which is "mostly" OK, but
208 * still has unstable bits.
209 *
210 * UBI uses the @copy_flag field to indicate that this logical eraseblock is a
211 * copy. UBI also calculates data CRC when the data is moved and stores it at
212 * the @data_crc field of the copy (P1). So when UBI needs to pick one physical
213 * eraseblock of two (P or P1), the @copy_flag of the newer one (P1) is
214 * examined. If it is cleared, the situation is simple and the newer one is
215 * picked. If it is set, the data CRC of the copy (P1) is examined. If the CRC
216 * checksum is correct, this physical eraseblock is selected (P1). Otherwise
217 * the older one (P) is selected.
218 *
219 * Note, there is an obsolete @leb_ver field which was used instead of @sqnum
220 * in the past. But it is not used anymore and we keep it in order to be able
221 * to deal with old UBI images. It will be removed at some point.
222 *
223 * There are 2 sorts of volumes in UBI: user volumes and internal volumes.
224 * Internal volumes are not seen from outside and are used for various internal
225 * UBI purposes. In this implementation there is only one internal volume - the
226 * layout volume. Internal volumes are the main mechanism of UBI extensions.
227 * For example, in future one may introduce a journal internal volume. Internal
228 * volumes have their own reserved range of IDs.
229 *
230 * The @compat field is only used for internal volumes and contains the "degree
231 * of their compatibility". It is always zero for user volumes. This field
232 * provides a mechanism to introduce UBI extensions and to be still compatible
233 * with older UBI binaries. For example, if someone introduced a journal in
234 * future, he would probably use %UBI_COMPAT_DELETE compatibility for the
235 * journal volume. And in this case, older UBI binaries, which know nothing
236 * about the journal volume, would just delete this volume and work perfectly
237 * fine. This is similar to what Ext2fs does when it is fed by an Ext3fs image
238 * - it just ignores the Ext3fs journal.
239 *
240 * The @data_crc field contains the CRC checksum of the contents of the logical
241 * eraseblock if this is a static volume. In case of dynamic volumes, it does
242 * not contain the CRC checksum as a rule. The only exception is when the
243 * data of the physical eraseblock was moved by the wear-leveling unit, then
244 * the wear-leveling unit calculates the data CRC and stores it in the
245 * @data_crc field. And of course, the @copy_flag is %1 in this case.
246 *
247 * The @data_size field is used only for static volumes because UBI has to know
248 * how many bytes of data are stored in this eraseblock. For dynamic volumes,
249 * this field usually contains zero. The only exception is when the data of the
250 * physical eraseblock was moved to another physical eraseblock for
251 * wear-leveling reasons. In this case, UBI calculates CRC checksum of the
252 * contents and uses both @data_crc and @data_size fields. In this case, the
253 * @data_size field contains data size.
254 *
255 * The @used_ebs field is used only for static volumes and indicates how many
256 * eraseblocks the data of the volume takes. For dynamic volumes this field is
257 * not used and always contains zero.
258 *
259 * The @data_pad is calculated when volumes are created using the alignment
260 * parameter. So, effectively, the @data_pad field reduces the size of logical
261 * eraseblocks of this volume. This is very handy when one uses block-oriented
262 * software (say, cramfs) on top of the UBI volume.
263 */
264struct ubi_vid_hdr {
265 ubi32_t magic;
266 uint8_t version;
267 uint8_t vol_type;
268 uint8_t copy_flag;
269 uint8_t compat;
270 ubi32_t vol_id;
271 ubi32_t lnum;
272 ubi32_t leb_ver; /* obsolete, to be removed, don't use */
273 ubi32_t data_size;
274 ubi32_t used_ebs;
275 ubi32_t data_pad;
276 ubi32_t data_crc;
277 uint8_t padding1[4];
278 ubi64_t sqnum;
279 uint8_t padding2[12];
280 ubi32_t hdr_crc;
281} __attribute__ ((packed));
282
283/* Internal UBI volumes count */
284#define UBI_INT_VOL_COUNT 1
285
286/*
287 * Starting ID of internal volumes. There is reserved room for 4096 internal
288 * volumes.
289 */
290#define UBI_INTERNAL_VOL_START (0x7FFFFFFF - 4096)
291
292/* The layout volume contains the volume table */
293
294#define UBI_LAYOUT_VOL_ID UBI_INTERNAL_VOL_START
295#define UBI_LAYOUT_VOLUME_EBS 2
296#define UBI_LAYOUT_VOLUME_NAME "layout volume"
297#define UBI_LAYOUT_VOLUME_COMPAT UBI_COMPAT_REJECT
298
299/* The maximum number of volumes per one UBI device */
300#define UBI_MAX_VOLUMES 128
301
302/* The maximum volume name length */
303#define UBI_VOL_NAME_MAX 127
304
305/* Size of the volume table record */
306#define UBI_VTBL_RECORD_SIZE sizeof(struct ubi_vtbl_record)
307
308/* Size of the volume table record without the ending CRC */
309#define UBI_VTBL_RECORD_SIZE_CRC (UBI_VTBL_RECORD_SIZE - sizeof(ubi32_t))
310
311/**
312 * struct ubi_vtbl_record - a record in the volume table.
313 * @reserved_pebs: how many physical eraseblocks are reserved for this volume
314 * @alignment: volume alignment
315 * @data_pad: how many bytes are unused at the end of each physical
316 * eraseblock to satisfy the requested alignment
317 * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME)
318 * @upd_marker: if volume update was started but not finished
319 * @name_len: volume name length
320 * @name: the volume name
321 * @padding2: reserved, zeroes
322 * @crc: a CRC32 checksum of the record
323 *
324 * The volume table records are stored in the volume table, which is stored in
325 * the layout volume. The layout volume consists of 2 logical eraseblocks, each
326 * of which contains a copy of the volume table (i.e., the volume table is
327 * duplicated). The volume table is an array of &struct ubi_vtbl_record
328 * objects indexed by the volume ID.
329 *
330 * If the size of the logical eraseblock is large enough to fit
331 * %UBI_MAX_VOLUMES records, the volume table contains %UBI_MAX_VOLUMES
332 * records. Otherwise, it contains as many records as it can fit (i.e., size of
333 * logical eraseblock divided by sizeof(struct ubi_vtbl_record)).
334 *
335 * The @upd_marker flag is used to implement volume update. It is set to %1
336 * before update and set to %0 after the update. So if the update operation was
337 * interrupted, UBI knows that the volume is corrupted.
338 *
339 * The @alignment field is specified when the volume is created and cannot be
340 * later changed. It may be useful, for example, when a block-oriented file
341 * system works on top of UBI. The @data_pad field is calculated using the
342 * logical eraseblock size and @alignment. The alignment must be a multiple of the
343 * minimal flash I/O unit. If @alignment is 1, all the available space of
344 * the physical eraseblocks is used.
345 *
346 * Empty records contain all zeroes and the CRC checksum of those zeroes.
347 */
348struct ubi_vtbl_record {
349 ubi32_t reserved_pebs;
350 ubi32_t alignment;
351 ubi32_t data_pad;
352 uint8_t vol_type;
353 uint8_t upd_marker;
354 ubi16_t name_len;
355 uint8_t name[UBI_VOL_NAME_MAX+1];
356 uint8_t padding2[24];
357 ubi32_t crc;
358} __attribute__ ((packed));
359
360#endif /* !__UBI_HEADER_H__ */
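
Because ubi-header.h is exported to user space (see the Kbuild change above), the on-flash structures can be parsed straight out of a flash dump with the ubiNN_to_cpu() helpers. A small sketch that checks an erase counter header's magic; the dump file name is made up and CRC verification is omitted:

#include <stdint.h>
#include <stdio.h>
#include <mtd/ubi-header.h>

int main(void)
{
	struct ubi_ec_hdr ec;
	FILE *f = fopen("flash.img", "rb");	/* hypothetical dump */

	if (!f || fread(&ec, UBI_EC_HDR_SIZE, 1, f) != 1) {
		perror("flash.img");
		return 1;
	}
	if (ubi32_to_cpu(ec.magic) != UBI_EC_HDR_MAGIC) {
		fprintf(stderr, "no UBI erase counter header found\n");
		return 1;
	}
	/* Stored as 64 bits, but capped at UBI_MAX_ERASECOUNTER. */
	printf("erase counter: %llu\n",
	       (unsigned long long)ubi64_to_cpu(ec.ec));
	return 0;
}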
diff --git a/include/mtd/ubi-user.h b/include/mtd/ubi-user.h
new file mode 100644
index 000000000000..fe06ded0e6b8
--- /dev/null
+++ b/include/mtd/ubi-user.h
@@ -0,0 +1,161 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2006
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Author: Artem Bityutskiy (Битюцкий Артём)
19 */
20
21#ifndef __UBI_USER_H__
22#define __UBI_USER_H__
23
24/*
25 * UBI volume creation
26 * ~~~~~~~~~~~~~~~~~~~
27 *
28 * UBI volumes are created via the %UBI_IOCMKVOL IOCTL command of UBI character
29 * device. A &struct ubi_mkvol_req object has to be properly filled and a
30 * pointer to it has to be passed to the IOCTL.
31 *
32 * UBI volume deletion
33 * ~~~~~~~~~~~~~~~~~~~
34 *
35 * To delete a volume, the %UBI_IOCRMVOL IOCTL command of the UBI character
36 * device should be used. A pointer to the 32-bit volume ID has to be passed
37 * to the IOCTL.
38 *
39 * UBI volume re-size
40 * ~~~~~~~~~~~~~~~~~~
41 *
42 * To re-size a volume, the %UBI_IOCRSVOL IOCTL command of the UBI character
43 * device should be used. A &struct ubi_rsvol_req object has to be properly
44 * filled and a pointer to it has to be passed to the IOCTL.
45 *
46 * UBI volume update
47 * ~~~~~~~~~~~~~~~~~
48 *
49 * Volume update should be done via the %UBI_IOCVOLUP IOCTL command of the
50 * corresponding UBI volume character device. A pointer to a 64-bit update
51 * size should be passed to the IOCTL. After that, UBI expects the user to write
52 * this number of bytes to the volume character device. The update is finished
53 * when the claimed number of bytes has been written. So, the volume update sequence
54 * is something like:
55 *
56 * fd = open("/dev/my_volume");
57 * ioctl(fd, UBI_IOCVOLUP, &image_size);
58 * write(fd, buf, image_size);
59 * close(fd);
60 */
61
62/*
63 * When a new volume is created, users may either specify the volume number they
64 * want to create or let UBI automatically assign a volume number using this
65 * constant.
66 */
67#define UBI_VOL_NUM_AUTO (-1)
68
69/* Maximum volume name length */
70#define UBI_MAX_VOLUME_NAME 127
71
72/* IOCTL commands of UBI character devices */
73
74#define UBI_IOC_MAGIC 'o'
75
76/* Create an UBI volume */
77#define UBI_IOCMKVOL _IOW(UBI_IOC_MAGIC, 0, struct ubi_mkvol_req)
78/* Remove an UBI volume */
79#define UBI_IOCRMVOL _IOW(UBI_IOC_MAGIC, 1, int32_t)
80/* Re-size an UBI volume */
81#define UBI_IOCRSVOL _IOW(UBI_IOC_MAGIC, 2, struct ubi_rsvol_req)
82
83/* IOCTL commands of UBI volume character devices */
84
85#define UBI_VOL_IOC_MAGIC 'O'
86
87/* Start UBI volume update */
88#define UBI_IOCVOLUP _IOW(UBI_VOL_IOC_MAGIC, 0, int64_t)
89/* An eraseblock erasure command, used for debugging, disabled by default */
90#define UBI_IOCEBER _IOW(UBI_VOL_IOC_MAGIC, 1, int32_t)
91
92/*
93 * UBI volume type constants.
94 *
95 * @UBI_DYNAMIC_VOLUME: dynamic volume
96 * @UBI_STATIC_VOLUME: static volume
97 */
98enum {
99 UBI_DYNAMIC_VOLUME = 3,
100 UBI_STATIC_VOLUME = 4
101};
102
103/**
104 * struct ubi_mkvol_req - volume description data structure used in
105 * volume creation requests.
106 * @vol_id: volume number
107 * @alignment: volume alignment
108 * @bytes: volume size in bytes
109 * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME)
110 * @padding1: reserved for future use, not used
111 * @name_len: volume name length
112 * @padding2: reserved for future use, not used
113 * @name: volume name
114 *
115 * This structure is used by userspace programs when creating new volumes. The
116 * @bytes field is only necessary when creating static volumes.
117 *
118 * The @alignment field specifies the required alignment of the volume logical
119 * eraseblock. This means, that the size of logical eraseblocks will be aligned
120 * to this number, i.e.,
121 * (UBI device logical eraseblock size) mod (@alignment) = 0.
122 *
123 * To put it differently, the logical eraseblock of this volume may be slightly
124 * shortened in order to make it properly aligned. The alignment has to be a
125 * multiple of the flash minimal input/output unit, or %1 to utilize the entire
126 * available space of logical eraseblocks.
127 *
128 * The @alignment field may be useful, for example, when one wants to maintain
129 * a block device on top of an UBI volume. In this case, it is desirable to fit
130 * an integer number of blocks in logical eraseblocks of this UBI volume. With
131 * alignment it is possible to update this volume using plain UBI volume image
132 * BLOBs, without caring about how to properly align them.
133 */
134struct ubi_mkvol_req {
135 int32_t vol_id;
136 int32_t alignment;
137 int64_t bytes;
138 int8_t vol_type;
139 int8_t padding1;
140 int16_t name_len;
141 int8_t padding2[4];
142 char name[UBI_MAX_VOLUME_NAME+1];
143} __attribute__ ((packed));
144
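
Putting the constants and the structure together, a hedged sketch of a
%UBI_IOCMKVOL call follows; the "/dev/ubi0" control-node path is an assumption
of this example, not something the header defines:

	#include <fcntl.h>
	#include <stdint.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <mtd/ubi-user.h>

	static int make_volume(const char *ctrl, const char *name,
			       int64_t bytes)
	{
		struct ubi_mkvol_req req;
		int fd = open(ctrl, O_RDWR);	/* e.g. "/dev/ubi0" */

		if (fd < 0)
			return -1;
		memset(&req, 0, sizeof(req));
		req.vol_id = UBI_VOL_NUM_AUTO;	/* let UBI pick the number */
		req.alignment = 1;		/* use the whole LEB */
		req.bytes = bytes;
		req.vol_type = UBI_DYNAMIC_VOLUME;
		req.name_len = strlen(name);
		strncpy(req.name, name, UBI_MAX_VOLUME_NAME);
		if (ioctl(fd, UBI_IOCMKVOL, &req) < 0) {
			close(fd);
			return -1;
		}
		return close(fd);
	}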
145/**
146 * struct ubi_rsvol_req - a data structure used in volume re-size requests.
147 * @vol_id: ID of the volume to re-size
148 * @bytes: new size of the volume in bytes
149 *
150 * Re-sizing is possible for both dynamic and static volumes. But while dynamic
151 * volumes may be re-sized arbitrarily, static volumes cannot be made smaller
152 * than the number of bytes they bear. To arbitrarily shrink a static volume,
153 * it must be wiped out first (by means of a volume update operation with zero
154 * bytes).
155 */
156struct ubi_rsvol_req {
157 int64_t bytes;
158 int32_t vol_id;
159} __attribute__ ((packed));
160
161#endif /* __UBI_USER_H__ */
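
And a matching sketch for %UBI_IOCRSVOL. Per the comment above, shrinking a
static volume below the bytes it bears requires wiping it first with a
zero-length update; the control-node path is again an assumption:

	#include <fcntl.h>
	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <mtd/ubi-user.h>

	static int resize_volume(const char *ctrl, int32_t vol_id,
				 int64_t bytes)
	{
		struct ubi_rsvol_req req = { .bytes = bytes, .vol_id = vol_id };
		int fd = open(ctrl, O_RDWR);	/* e.g. "/dev/ubi0" */

		if (fd < 0)
			return -1;
		if (ioctl(fd, UBI_IOCRSVOL, &req) < 0) {
			close(fd);
			return -1;
		}
		return close(fd);
	}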
diff --git a/kernel/sched.c b/kernel/sched.c
index b9a683730148..960d7c5fca39 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4746,7 +4746,7 @@ void show_state_filter(unsigned long state_filter)
4746 * console might take alot of time: 4746 * console might take alot of time:
4747 */ 4747 */
4748 touch_nmi_watchdog(); 4748 touch_nmi_watchdog();
4749 if (p->state & state_filter) 4749 if (!state_filter || (p->state & state_filter))
4750 show_task(p); 4750 show_task(p);
4751 } while_each_thread(g, p); 4751 } while_each_thread(g, p);
4752 4752
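
The small change above flips the meaning of a zero filter in
show_state_filter(): previously no task could match an empty mask, now an
empty mask acts as a wildcard. The predicate, lifted into a standalone helper
for illustration (task_matches is a hypothetical name):

	#include <stdbool.h>

	/* a zero state_filter now matches every task; a non-zero one still
	 * requires an intersection with the task's state bits */
	static bool task_matches(unsigned long state, unsigned long state_filter)
	{
		return !state_filter || (state & state_filter);
	}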
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 11a3404d65af..e1f18489db1d 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -92,6 +92,33 @@ int cipso_v4_rbm_optfmt = 0;
92int cipso_v4_rbm_strictvalid = 1; 92int cipso_v4_rbm_strictvalid = 1;
93 93
94/* 94/*
95 * Protocol Constants
96 */
97
98/* Maximum size of the CIPSO IP option, derived from the fact that the maximum
99 * IPv4 header size is 60 bytes and the base IPv4 header is 20 bytes long. */
100#define CIPSO_V4_OPT_LEN_MAX 40
101
102/* Length of the base CIPSO option, this includes the option type (1 byte), the
103 * option length (1 byte), and the DOI (4 bytes). */
104#define CIPSO_V4_HDR_LEN 6
105
106/* Base length of the restrictive category bitmap tag (tag #1). */
107#define CIPSO_V4_TAG_RBM_BLEN 4
108
109/* Base length of the enumerated category tag (tag #2). */
110#define CIPSO_V4_TAG_ENUM_BLEN 4
111
112/* Base length of the ranged categories bitmap tag (tag #5). */
113#define CIPSO_V4_TAG_RNG_BLEN 4
114/* The maximum number of category ranges permitted in the ranged category tag
115 * (tag #5). You may note that the IETF draft states that the maximum number
116 * of category ranges is 7, but if the low end of the last category range is
117 * zero then it is possible to fit 8 category ranges because the zero should
118 * be omitted. */
119#define CIPSO_V4_TAG_RNG_CAT_MAX 8
120
121/*
95 * Helper Functions 122 * Helper Functions
96 */ 123 */
97 124
@@ -1109,16 +1136,15 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def,
1109 unsigned char *net_cat, 1136 unsigned char *net_cat,
1110 u32 net_cat_len) 1137 u32 net_cat_len)
1111{ 1138{
1112 /* The constant '16' is not random, it is the maximum number of
1113 * high/low category range pairs as permitted by the CIPSO draft based
1114 * on a maximum IPv4 header length of 60 bytes - the BUG_ON() assertion
1115 * does a sanity check to make sure we don't overflow the array. */
1116 int iter = -1; 1139 int iter = -1;
1117 u16 array[16]; 1140 u16 array[CIPSO_V4_TAG_RNG_CAT_MAX * 2];
1118 u32 array_cnt = 0; 1141 u32 array_cnt = 0;
1119 u32 cat_size = 0; 1142 u32 cat_size = 0;
1120 1143
1121 BUG_ON(net_cat_len > 30); 1144 /* make sure we don't overflow the 'array[]' variable */
1145 if (net_cat_len >
1146 (CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN - CIPSO_V4_TAG_RNG_BLEN))
1147 return -ENOSPC;
1122 1148
1123 for (;;) { 1149 for (;;) {
1124 iter = netlbl_secattr_catmap_walk(secattr->mls_cat, iter + 1); 1150 iter = netlbl_secattr_catmap_walk(secattr->mls_cat, iter + 1);
@@ -1196,9 +1222,6 @@ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def,
1196 * Protocol Handling Functions 1222 * Protocol Handling Functions
1197 */ 1223 */
1198 1224
1199#define CIPSO_V4_OPT_LEN_MAX 40
1200#define CIPSO_V4_HDR_LEN 6
1201
1202/** 1225/**
1203 * cipso_v4_gentag_hdr - Generate a CIPSO option header 1226 * cipso_v4_gentag_hdr - Generate a CIPSO option header
1204 * @doi_def: the DOI definition 1227 * @doi_def: the DOI definition
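
The replaced magic numbers follow from simple arithmetic: a 60-byte maximum
IPv4 header minus the 20-byte base header leaves 40 bytes for options; the
6-byte CIPSO header and the 4-byte tag #5 base leave 30 bytes of category
data. Seven full high/low pairs consume 28 of those bytes, and when the last
range's low end is an omitted zero, the remaining 2 bytes hold an eighth high
value, hence CIPSO_V4_TAG_RNG_CAT_MAX of 8 and the 16-entry u16 array. A
compile-time restatement of that sizing, with the constants redefined locally
so the sketch builds outside the kernel:

	#include <stddef.h>
	#include <stdint.h>

	#define CIPSO_V4_OPT_LEN_MAX	 40 /* 60-byte max IPv4 hdr - 20 base */
	#define CIPSO_V4_HDR_LEN	  6 /* type + length + DOI */
	#define CIPSO_V4_TAG_RNG_BLEN	  4 /* tag #5 base length */
	#define CIPSO_V4_TAG_RNG_CAT_MAX  8 /* 7 full + 1 implicit-zero range */

	/* 40 - 6 - 4 = 30 category bytes; the 32-byte array always has room */
	typedef char rng_array_fits[
		(CIPSO_V4_TAG_RNG_CAT_MAX * 2 * sizeof(uint16_t) >=
		 (size_t)(CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN -
			  CIPSO_V4_TAG_RNG_BLEN)) ? 1 : -1];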
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index e03a3282c551..f2535e7f2869 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -263,9 +263,6 @@ int netlbl_socket_setattr(const struct socket *sock,
263 int ret_val = -ENOENT; 263 int ret_val = -ENOENT;
264 struct netlbl_dom_map *dom_entry; 264 struct netlbl_dom_map *dom_entry;
265 265
266 if ((secattr->flags & NETLBL_SECATTR_DOMAIN) == 0)
267 return -ENOENT;
268
269 rcu_read_lock(); 266 rcu_read_lock();
270 dom_entry = netlbl_domhsh_getentry(secattr->domain); 267 dom_entry = netlbl_domhsh_getentry(secattr->domain);
271 if (dom_entry == NULL) 268 if (dom_entry == NULL)
diff --git a/security/selinux/Makefile b/security/selinux/Makefile
index faf2e02e4410..dc3502e30b19 100644
--- a/security/selinux/Makefile
+++ b/security/selinux/Makefile
@@ -8,5 +8,7 @@ selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o exports.o
8 8
9selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o 9selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o
10 10
11selinux-$(CONFIG_NETLABEL) += netlabel.o
12
11EXTRA_CFLAGS += -Isecurity/selinux/include 13EXTRA_CFLAGS += -Isecurity/selinux/include
12 14
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index da8caf10ef97..e4396a89edc6 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -217,6 +217,8 @@ static void avc_dump_query(struct audit_buffer *ab, u32 ssid, u32 tsid, u16 tcla
217 audit_log_format(ab, " tcontext=%s", scontext); 217 audit_log_format(ab, " tcontext=%s", scontext);
218 kfree(scontext); 218 kfree(scontext);
219 } 219 }
220
221 BUG_ON(tclass >= ARRAY_SIZE(class_to_string) || !class_to_string[tclass]);
220 audit_log_format(ab, " tclass=%s", class_to_string[tclass]); 222 audit_log_format(ab, " tclass=%s", class_to_string[tclass]);
221} 223}
222 224
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 5f02b4be1917..885a9a958b8d 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -77,7 +77,7 @@
77#include "objsec.h" 77#include "objsec.h"
78#include "netif.h" 78#include "netif.h"
79#include "xfrm.h" 79#include "xfrm.h"
80#include "selinux_netlabel.h" 80#include "netlabel.h"
81 81
82#define XATTR_SELINUX_SUFFIX "selinux" 82#define XATTR_SELINUX_SUFFIX "selinux"
83#define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX 83#define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX
@@ -3123,6 +3123,34 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad,
3123 return ret; 3123 return ret;
3124} 3124}
3125 3125
3126/**
3127 * selinux_skb_extlbl_sid - Determine the external label of a packet
3128 * @skb: the packet
3129 * @base_sid: the SELinux SID to use as a context for MLS only external labels
3130 * @sid: the packet's SID
3131 *
3132 * Description:
 3133 * Check the various forms of external packet labeling and determine
3134 * the external SID for the packet.
3135 *
3136 */
3137static void selinux_skb_extlbl_sid(struct sk_buff *skb,
3138 u32 base_sid,
3139 u32 *sid)
3140{
3141 u32 xfrm_sid;
3142 u32 nlbl_sid;
3143
3144 selinux_skb_xfrm_sid(skb, &xfrm_sid);
3145 if (selinux_netlbl_skbuff_getsid(skb,
3146 (xfrm_sid == SECSID_NULL ?
3147 base_sid : xfrm_sid),
3148 &nlbl_sid) != 0)
3149 nlbl_sid = SECSID_NULL;
3150
3151 *sid = (nlbl_sid == SECSID_NULL ? xfrm_sid : nlbl_sid);
3152}
3153
3126/* socket security operations */ 3154/* socket security operations */
3127static int socket_has_perm(struct task_struct *task, struct socket *sock, 3155static int socket_has_perm(struct task_struct *task, struct socket *sock,
3128 u32 perms) 3156 u32 perms)
@@ -3664,9 +3692,7 @@ static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *
3664 if (sock && sock->sk->sk_family == PF_UNIX) 3692 if (sock && sock->sk->sk_family == PF_UNIX)
3665 selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid); 3693 selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid);
3666 else if (skb) 3694 else if (skb)
3667 security_skb_extlbl_sid(skb, 3695 selinux_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &peer_secid);
3668 SECINITSID_UNLABELED,
3669 &peer_secid);
3670 3696
3671 if (peer_secid == SECSID_NULL) 3697 if (peer_secid == SECSID_NULL)
3672 err = -EINVAL; 3698 err = -EINVAL;
@@ -3727,7 +3753,7 @@ static int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb,
3727 u32 newsid; 3753 u32 newsid;
3728 u32 peersid; 3754 u32 peersid;
3729 3755
3730 security_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &peersid); 3756 selinux_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &peersid);
3731 if (peersid == SECSID_NULL) { 3757 if (peersid == SECSID_NULL) {
3732 req->secid = sksec->sid; 3758 req->secid = sksec->sid;
3733 req->peer_secid = SECSID_NULL; 3759 req->peer_secid = SECSID_NULL;
@@ -3765,7 +3791,7 @@ static void selinux_inet_conn_established(struct sock *sk,
3765{ 3791{
3766 struct sk_security_struct *sksec = sk->sk_security; 3792 struct sk_security_struct *sksec = sk->sk_security;
3767 3793
3768 security_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &sksec->peer_sid); 3794 selinux_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &sksec->peer_sid);
3769} 3795}
3770 3796
3771static void selinux_req_classify_flow(const struct request_sock *req, 3797static void selinux_req_classify_flow(const struct request_sock *req,
diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h
index ad9fb2d69b50..b83e74012a97 100644
--- a/security/selinux/include/av_perm_to_string.h
+++ b/security/selinux/include/av_perm_to_string.h
@@ -128,96 +128,6 @@
128 S_(SECCLASS_CAPABILITY, CAPABILITY__LEASE, "lease") 128 S_(SECCLASS_CAPABILITY, CAPABILITY__LEASE, "lease")
129 S_(SECCLASS_CAPABILITY, CAPABILITY__AUDIT_WRITE, "audit_write") 129 S_(SECCLASS_CAPABILITY, CAPABILITY__AUDIT_WRITE, "audit_write")
130 S_(SECCLASS_CAPABILITY, CAPABILITY__AUDIT_CONTROL, "audit_control") 130 S_(SECCLASS_CAPABILITY, CAPABILITY__AUDIT_CONTROL, "audit_control")
131 S_(SECCLASS_PASSWD, PASSWD__PASSWD, "passwd")
132 S_(SECCLASS_PASSWD, PASSWD__CHFN, "chfn")
133 S_(SECCLASS_PASSWD, PASSWD__CHSH, "chsh")
134 S_(SECCLASS_PASSWD, PASSWD__ROOTOK, "rootok")
135 S_(SECCLASS_PASSWD, PASSWD__CRONTAB, "crontab")
136 S_(SECCLASS_DRAWABLE, DRAWABLE__CREATE, "create")
137 S_(SECCLASS_DRAWABLE, DRAWABLE__DESTROY, "destroy")
138 S_(SECCLASS_DRAWABLE, DRAWABLE__DRAW, "draw")
139 S_(SECCLASS_DRAWABLE, DRAWABLE__COPY, "copy")
140 S_(SECCLASS_DRAWABLE, DRAWABLE__GETATTR, "getattr")
141 S_(SECCLASS_GC, GC__CREATE, "create")
142 S_(SECCLASS_GC, GC__FREE, "free")
143 S_(SECCLASS_GC, GC__GETATTR, "getattr")
144 S_(SECCLASS_GC, GC__SETATTR, "setattr")
145 S_(SECCLASS_WINDOW, WINDOW__ADDCHILD, "addchild")
146 S_(SECCLASS_WINDOW, WINDOW__CREATE, "create")
147 S_(SECCLASS_WINDOW, WINDOW__DESTROY, "destroy")
148 S_(SECCLASS_WINDOW, WINDOW__MAP, "map")
149 S_(SECCLASS_WINDOW, WINDOW__UNMAP, "unmap")
150 S_(SECCLASS_WINDOW, WINDOW__CHSTACK, "chstack")
151 S_(SECCLASS_WINDOW, WINDOW__CHPROPLIST, "chproplist")
152 S_(SECCLASS_WINDOW, WINDOW__CHPROP, "chprop")
153 S_(SECCLASS_WINDOW, WINDOW__LISTPROP, "listprop")
154 S_(SECCLASS_WINDOW, WINDOW__GETATTR, "getattr")
155 S_(SECCLASS_WINDOW, WINDOW__SETATTR, "setattr")
156 S_(SECCLASS_WINDOW, WINDOW__SETFOCUS, "setfocus")
157 S_(SECCLASS_WINDOW, WINDOW__MOVE, "move")
158 S_(SECCLASS_WINDOW, WINDOW__CHSELECTION, "chselection")
159 S_(SECCLASS_WINDOW, WINDOW__CHPARENT, "chparent")
160 S_(SECCLASS_WINDOW, WINDOW__CTRLLIFE, "ctrllife")
161 S_(SECCLASS_WINDOW, WINDOW__ENUMERATE, "enumerate")
162 S_(SECCLASS_WINDOW, WINDOW__TRANSPARENT, "transparent")
163 S_(SECCLASS_WINDOW, WINDOW__MOUSEMOTION, "mousemotion")
164 S_(SECCLASS_WINDOW, WINDOW__CLIENTCOMEVENT, "clientcomevent")
165 S_(SECCLASS_WINDOW, WINDOW__INPUTEVENT, "inputevent")
166 S_(SECCLASS_WINDOW, WINDOW__DRAWEVENT, "drawevent")
167 S_(SECCLASS_WINDOW, WINDOW__WINDOWCHANGEEVENT, "windowchangeevent")
168 S_(SECCLASS_WINDOW, WINDOW__WINDOWCHANGEREQUEST, "windowchangerequest")
169 S_(SECCLASS_WINDOW, WINDOW__SERVERCHANGEEVENT, "serverchangeevent")
170 S_(SECCLASS_WINDOW, WINDOW__EXTENSIONEVENT, "extensionevent")
171 S_(SECCLASS_FONT, FONT__LOAD, "load")
172 S_(SECCLASS_FONT, FONT__FREE, "free")
173 S_(SECCLASS_FONT, FONT__GETATTR, "getattr")
174 S_(SECCLASS_FONT, FONT__USE, "use")
175 S_(SECCLASS_COLORMAP, COLORMAP__CREATE, "create")
176 S_(SECCLASS_COLORMAP, COLORMAP__FREE, "free")
177 S_(SECCLASS_COLORMAP, COLORMAP__INSTALL, "install")
178 S_(SECCLASS_COLORMAP, COLORMAP__UNINSTALL, "uninstall")
179 S_(SECCLASS_COLORMAP, COLORMAP__LIST, "list")
180 S_(SECCLASS_COLORMAP, COLORMAP__READ, "read")
181 S_(SECCLASS_COLORMAP, COLORMAP__STORE, "store")
182 S_(SECCLASS_COLORMAP, COLORMAP__GETATTR, "getattr")
183 S_(SECCLASS_COLORMAP, COLORMAP__SETATTR, "setattr")
184 S_(SECCLASS_PROPERTY, PROPERTY__CREATE, "create")
185 S_(SECCLASS_PROPERTY, PROPERTY__FREE, "free")
186 S_(SECCLASS_PROPERTY, PROPERTY__READ, "read")
187 S_(SECCLASS_PROPERTY, PROPERTY__WRITE, "write")
188 S_(SECCLASS_CURSOR, CURSOR__CREATE, "create")
189 S_(SECCLASS_CURSOR, CURSOR__CREATEGLYPH, "createglyph")
190 S_(SECCLASS_CURSOR, CURSOR__FREE, "free")
191 S_(SECCLASS_CURSOR, CURSOR__ASSIGN, "assign")
192 S_(SECCLASS_CURSOR, CURSOR__SETATTR, "setattr")
193 S_(SECCLASS_XCLIENT, XCLIENT__KILL, "kill")
194 S_(SECCLASS_XINPUT, XINPUT__LOOKUP, "lookup")
195 S_(SECCLASS_XINPUT, XINPUT__GETATTR, "getattr")
196 S_(SECCLASS_XINPUT, XINPUT__SETATTR, "setattr")
197 S_(SECCLASS_XINPUT, XINPUT__SETFOCUS, "setfocus")
198 S_(SECCLASS_XINPUT, XINPUT__WARPPOINTER, "warppointer")
199 S_(SECCLASS_XINPUT, XINPUT__ACTIVEGRAB, "activegrab")
200 S_(SECCLASS_XINPUT, XINPUT__PASSIVEGRAB, "passivegrab")
201 S_(SECCLASS_XINPUT, XINPUT__UNGRAB, "ungrab")
202 S_(SECCLASS_XINPUT, XINPUT__BELL, "bell")
203 S_(SECCLASS_XINPUT, XINPUT__MOUSEMOTION, "mousemotion")
204 S_(SECCLASS_XINPUT, XINPUT__RELABELINPUT, "relabelinput")
205 S_(SECCLASS_XSERVER, XSERVER__SCREENSAVER, "screensaver")
206 S_(SECCLASS_XSERVER, XSERVER__GETHOSTLIST, "gethostlist")
207 S_(SECCLASS_XSERVER, XSERVER__SETHOSTLIST, "sethostlist")
208 S_(SECCLASS_XSERVER, XSERVER__GETFONTPATH, "getfontpath")
209 S_(SECCLASS_XSERVER, XSERVER__SETFONTPATH, "setfontpath")
210 S_(SECCLASS_XSERVER, XSERVER__GETATTR, "getattr")
211 S_(SECCLASS_XSERVER, XSERVER__GRAB, "grab")
212 S_(SECCLASS_XSERVER, XSERVER__UNGRAB, "ungrab")
213 S_(SECCLASS_XEXTENSION, XEXTENSION__QUERY, "query")
214 S_(SECCLASS_XEXTENSION, XEXTENSION__USE, "use")
215 S_(SECCLASS_PAX, PAX__PAGEEXEC, "pageexec")
216 S_(SECCLASS_PAX, PAX__EMUTRAMP, "emutramp")
217 S_(SECCLASS_PAX, PAX__MPROTECT, "mprotect")
218 S_(SECCLASS_PAX, PAX__RANDMMAP, "randmmap")
219 S_(SECCLASS_PAX, PAX__RANDEXEC, "randexec")
220 S_(SECCLASS_PAX, PAX__SEGMEXEC, "segmexec")
221 S_(SECCLASS_NETLINK_ROUTE_SOCKET, NETLINK_ROUTE_SOCKET__NLMSG_READ, "nlmsg_read") 131 S_(SECCLASS_NETLINK_ROUTE_SOCKET, NETLINK_ROUTE_SOCKET__NLMSG_READ, "nlmsg_read")
222 S_(SECCLASS_NETLINK_ROUTE_SOCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE, "nlmsg_write") 132 S_(SECCLASS_NETLINK_ROUTE_SOCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE, "nlmsg_write")
223 S_(SECCLASS_NETLINK_FIREWALL_SOCKET, NETLINK_FIREWALL_SOCKET__NLMSG_READ, "nlmsg_read") 133 S_(SECCLASS_NETLINK_FIREWALL_SOCKET, NETLINK_FIREWALL_SOCKET__NLMSG_READ, "nlmsg_read")
@@ -232,16 +142,6 @@
232 S_(SECCLASS_NETLINK_AUDIT_SOCKET, NETLINK_AUDIT_SOCKET__NLMSG_READPRIV, "nlmsg_readpriv") 142 S_(SECCLASS_NETLINK_AUDIT_SOCKET, NETLINK_AUDIT_SOCKET__NLMSG_READPRIV, "nlmsg_readpriv")
233 S_(SECCLASS_NETLINK_IP6FW_SOCKET, NETLINK_IP6FW_SOCKET__NLMSG_READ, "nlmsg_read") 143 S_(SECCLASS_NETLINK_IP6FW_SOCKET, NETLINK_IP6FW_SOCKET__NLMSG_READ, "nlmsg_read")
234 S_(SECCLASS_NETLINK_IP6FW_SOCKET, NETLINK_IP6FW_SOCKET__NLMSG_WRITE, "nlmsg_write") 144 S_(SECCLASS_NETLINK_IP6FW_SOCKET, NETLINK_IP6FW_SOCKET__NLMSG_WRITE, "nlmsg_write")
235 S_(SECCLASS_DBUS, DBUS__ACQUIRE_SVC, "acquire_svc")
236 S_(SECCLASS_DBUS, DBUS__SEND_MSG, "send_msg")
237 S_(SECCLASS_NSCD, NSCD__GETPWD, "getpwd")
238 S_(SECCLASS_NSCD, NSCD__GETGRP, "getgrp")
239 S_(SECCLASS_NSCD, NSCD__GETHOST, "gethost")
240 S_(SECCLASS_NSCD, NSCD__GETSTAT, "getstat")
241 S_(SECCLASS_NSCD, NSCD__ADMIN, "admin")
242 S_(SECCLASS_NSCD, NSCD__SHMEMPWD, "shmempwd")
243 S_(SECCLASS_NSCD, NSCD__SHMEMGRP, "shmemgrp")
244 S_(SECCLASS_NSCD, NSCD__SHMEMHOST, "shmemhost")
245 S_(SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, "sendto") 145 S_(SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, "sendto")
246 S_(SECCLASS_ASSOCIATION, ASSOCIATION__RECVFROM, "recvfrom") 146 S_(SECCLASS_ASSOCIATION, ASSOCIATION__RECVFROM, "recvfrom")
247 S_(SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, "setcontext") 147 S_(SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, "setcontext")
@@ -256,7 +156,5 @@
256 S_(SECCLASS_KEY, KEY__LINK, "link") 156 S_(SECCLASS_KEY, KEY__LINK, "link")
257 S_(SECCLASS_KEY, KEY__SETATTR, "setattr") 157 S_(SECCLASS_KEY, KEY__SETATTR, "setattr")
258 S_(SECCLASS_KEY, KEY__CREATE, "create") 158 S_(SECCLASS_KEY, KEY__CREATE, "create")
259 S_(SECCLASS_CONTEXT, CONTEXT__TRANSLATE, "translate")
260 S_(SECCLASS_CONTEXT, CONTEXT__CONTAINS, "contains")
261 S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NODE_BIND, "node_bind") 159 S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NODE_BIND, "node_bind")
262 S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NAME_CONNECT, "name_connect") 160 S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NAME_CONNECT, "name_connect")
diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h
index 2de4b5fe3aa1..5fee1735bffe 100644
--- a/security/selinux/include/av_permissions.h
+++ b/security/selinux/include/av_permissions.h
@@ -16,7 +16,6 @@
16#define COMMON_FILE__SWAPON 0x00004000UL 16#define COMMON_FILE__SWAPON 0x00004000UL
17#define COMMON_FILE__QUOTAON 0x00008000UL 17#define COMMON_FILE__QUOTAON 0x00008000UL
18#define COMMON_FILE__MOUNTON 0x00010000UL 18#define COMMON_FILE__MOUNTON 0x00010000UL
19
20#define COMMON_SOCKET__IOCTL 0x00000001UL 19#define COMMON_SOCKET__IOCTL 0x00000001UL
21#define COMMON_SOCKET__READ 0x00000002UL 20#define COMMON_SOCKET__READ 0x00000002UL
22#define COMMON_SOCKET__WRITE 0x00000004UL 21#define COMMON_SOCKET__WRITE 0x00000004UL
@@ -39,7 +38,6 @@
39#define COMMON_SOCKET__RECV_MSG 0x00080000UL 38#define COMMON_SOCKET__RECV_MSG 0x00080000UL
40#define COMMON_SOCKET__SEND_MSG 0x00100000UL 39#define COMMON_SOCKET__SEND_MSG 0x00100000UL
41#define COMMON_SOCKET__NAME_BIND 0x00200000UL 40#define COMMON_SOCKET__NAME_BIND 0x00200000UL
42
43#define COMMON_IPC__CREATE 0x00000001UL 41#define COMMON_IPC__CREATE 0x00000001UL
44#define COMMON_IPC__DESTROY 0x00000002UL 42#define COMMON_IPC__DESTROY 0x00000002UL
45#define COMMON_IPC__GETATTR 0x00000004UL 43#define COMMON_IPC__GETATTR 0x00000004UL
@@ -49,7 +47,6 @@
49#define COMMON_IPC__ASSOCIATE 0x00000040UL 47#define COMMON_IPC__ASSOCIATE 0x00000040UL
50#define COMMON_IPC__UNIX_READ 0x00000080UL 48#define COMMON_IPC__UNIX_READ 0x00000080UL
51#define COMMON_IPC__UNIX_WRITE 0x00000100UL 49#define COMMON_IPC__UNIX_WRITE 0x00000100UL
52
53#define FILESYSTEM__MOUNT 0x00000001UL 50#define FILESYSTEM__MOUNT 0x00000001UL
54#define FILESYSTEM__REMOUNT 0x00000002UL 51#define FILESYSTEM__REMOUNT 0x00000002UL
55#define FILESYSTEM__UNMOUNT 0x00000004UL 52#define FILESYSTEM__UNMOUNT 0x00000004UL
@@ -60,7 +57,6 @@
60#define FILESYSTEM__ASSOCIATE 0x00000080UL 57#define FILESYSTEM__ASSOCIATE 0x00000080UL
61#define FILESYSTEM__QUOTAMOD 0x00000100UL 58#define FILESYSTEM__QUOTAMOD 0x00000100UL
62#define FILESYSTEM__QUOTAGET 0x00000200UL 59#define FILESYSTEM__QUOTAGET 0x00000200UL
63
64#define DIR__IOCTL 0x00000001UL 60#define DIR__IOCTL 0x00000001UL
65#define DIR__READ 0x00000002UL 61#define DIR__READ 0x00000002UL
66#define DIR__WRITE 0x00000004UL 62#define DIR__WRITE 0x00000004UL
@@ -78,13 +74,11 @@
78#define DIR__SWAPON 0x00004000UL 74#define DIR__SWAPON 0x00004000UL
79#define DIR__QUOTAON 0x00008000UL 75#define DIR__QUOTAON 0x00008000UL
80#define DIR__MOUNTON 0x00010000UL 76#define DIR__MOUNTON 0x00010000UL
81
82#define DIR__ADD_NAME 0x00020000UL 77#define DIR__ADD_NAME 0x00020000UL
83#define DIR__REMOVE_NAME 0x00040000UL 78#define DIR__REMOVE_NAME 0x00040000UL
84#define DIR__REPARENT 0x00080000UL 79#define DIR__REPARENT 0x00080000UL
85#define DIR__SEARCH 0x00100000UL 80#define DIR__SEARCH 0x00100000UL
86#define DIR__RMDIR 0x00200000UL 81#define DIR__RMDIR 0x00200000UL
87
88#define FILE__IOCTL 0x00000001UL 82#define FILE__IOCTL 0x00000001UL
89#define FILE__READ 0x00000002UL 83#define FILE__READ 0x00000002UL
90#define FILE__WRITE 0x00000004UL 84#define FILE__WRITE 0x00000004UL
@@ -102,11 +96,9 @@
102#define FILE__SWAPON 0x00004000UL 96#define FILE__SWAPON 0x00004000UL
103#define FILE__QUOTAON 0x00008000UL 97#define FILE__QUOTAON 0x00008000UL
104#define FILE__MOUNTON 0x00010000UL 98#define FILE__MOUNTON 0x00010000UL
105
106#define FILE__EXECUTE_NO_TRANS 0x00020000UL 99#define FILE__EXECUTE_NO_TRANS 0x00020000UL
107#define FILE__ENTRYPOINT 0x00040000UL 100#define FILE__ENTRYPOINT 0x00040000UL
108#define FILE__EXECMOD 0x00080000UL 101#define FILE__EXECMOD 0x00080000UL
109
110#define LNK_FILE__IOCTL 0x00000001UL 102#define LNK_FILE__IOCTL 0x00000001UL
111#define LNK_FILE__READ 0x00000002UL 103#define LNK_FILE__READ 0x00000002UL
112#define LNK_FILE__WRITE 0x00000004UL 104#define LNK_FILE__WRITE 0x00000004UL
@@ -124,7 +116,6 @@
124#define LNK_FILE__SWAPON 0x00004000UL 116#define LNK_FILE__SWAPON 0x00004000UL
125#define LNK_FILE__QUOTAON 0x00008000UL 117#define LNK_FILE__QUOTAON 0x00008000UL
126#define LNK_FILE__MOUNTON 0x00010000UL 118#define LNK_FILE__MOUNTON 0x00010000UL
127
128#define CHR_FILE__IOCTL 0x00000001UL 119#define CHR_FILE__IOCTL 0x00000001UL
129#define CHR_FILE__READ 0x00000002UL 120#define CHR_FILE__READ 0x00000002UL
130#define CHR_FILE__WRITE 0x00000004UL 121#define CHR_FILE__WRITE 0x00000004UL
@@ -142,11 +133,9 @@
142#define CHR_FILE__SWAPON 0x00004000UL 133#define CHR_FILE__SWAPON 0x00004000UL
143#define CHR_FILE__QUOTAON 0x00008000UL 134#define CHR_FILE__QUOTAON 0x00008000UL
144#define CHR_FILE__MOUNTON 0x00010000UL 135#define CHR_FILE__MOUNTON 0x00010000UL
145
146#define CHR_FILE__EXECUTE_NO_TRANS 0x00020000UL 136#define CHR_FILE__EXECUTE_NO_TRANS 0x00020000UL
147#define CHR_FILE__ENTRYPOINT 0x00040000UL 137#define CHR_FILE__ENTRYPOINT 0x00040000UL
148#define CHR_FILE__EXECMOD 0x00080000UL 138#define CHR_FILE__EXECMOD 0x00080000UL
149
150#define BLK_FILE__IOCTL 0x00000001UL 139#define BLK_FILE__IOCTL 0x00000001UL
151#define BLK_FILE__READ 0x00000002UL 140#define BLK_FILE__READ 0x00000002UL
152#define BLK_FILE__WRITE 0x00000004UL 141#define BLK_FILE__WRITE 0x00000004UL
@@ -164,7 +153,6 @@
164#define BLK_FILE__SWAPON 0x00004000UL 153#define BLK_FILE__SWAPON 0x00004000UL
165#define BLK_FILE__QUOTAON 0x00008000UL 154#define BLK_FILE__QUOTAON 0x00008000UL
166#define BLK_FILE__MOUNTON 0x00010000UL 155#define BLK_FILE__MOUNTON 0x00010000UL
167
168#define SOCK_FILE__IOCTL 0x00000001UL 156#define SOCK_FILE__IOCTL 0x00000001UL
169#define SOCK_FILE__READ 0x00000002UL 157#define SOCK_FILE__READ 0x00000002UL
170#define SOCK_FILE__WRITE 0x00000004UL 158#define SOCK_FILE__WRITE 0x00000004UL
@@ -182,7 +170,6 @@
182#define SOCK_FILE__SWAPON 0x00004000UL 170#define SOCK_FILE__SWAPON 0x00004000UL
183#define SOCK_FILE__QUOTAON 0x00008000UL 171#define SOCK_FILE__QUOTAON 0x00008000UL
184#define SOCK_FILE__MOUNTON 0x00010000UL 172#define SOCK_FILE__MOUNTON 0x00010000UL
185
186#define FIFO_FILE__IOCTL 0x00000001UL 173#define FIFO_FILE__IOCTL 0x00000001UL
187#define FIFO_FILE__READ 0x00000002UL 174#define FIFO_FILE__READ 0x00000002UL
188#define FIFO_FILE__WRITE 0x00000004UL 175#define FIFO_FILE__WRITE 0x00000004UL
@@ -200,9 +187,7 @@
200#define FIFO_FILE__SWAPON 0x00004000UL 187#define FIFO_FILE__SWAPON 0x00004000UL
201#define FIFO_FILE__QUOTAON 0x00008000UL 188#define FIFO_FILE__QUOTAON 0x00008000UL
202#define FIFO_FILE__MOUNTON 0x00010000UL 189#define FIFO_FILE__MOUNTON 0x00010000UL
203
204#define FD__USE 0x00000001UL 190#define FD__USE 0x00000001UL
205
206#define SOCKET__IOCTL 0x00000001UL 191#define SOCKET__IOCTL 0x00000001UL
207#define SOCKET__READ 0x00000002UL 192#define SOCKET__READ 0x00000002UL
208#define SOCKET__WRITE 0x00000004UL 193#define SOCKET__WRITE 0x00000004UL
@@ -225,7 +210,6 @@
225#define SOCKET__RECV_MSG 0x00080000UL 210#define SOCKET__RECV_MSG 0x00080000UL
226#define SOCKET__SEND_MSG 0x00100000UL 211#define SOCKET__SEND_MSG 0x00100000UL
227#define SOCKET__NAME_BIND 0x00200000UL 212#define SOCKET__NAME_BIND 0x00200000UL
228
229#define TCP_SOCKET__IOCTL 0x00000001UL 213#define TCP_SOCKET__IOCTL 0x00000001UL
230#define TCP_SOCKET__READ 0x00000002UL 214#define TCP_SOCKET__READ 0x00000002UL
231#define TCP_SOCKET__WRITE 0x00000004UL 215#define TCP_SOCKET__WRITE 0x00000004UL
@@ -248,13 +232,11 @@
248#define TCP_SOCKET__RECV_MSG 0x00080000UL 232#define TCP_SOCKET__RECV_MSG 0x00080000UL
249#define TCP_SOCKET__SEND_MSG 0x00100000UL 233#define TCP_SOCKET__SEND_MSG 0x00100000UL
250#define TCP_SOCKET__NAME_BIND 0x00200000UL 234#define TCP_SOCKET__NAME_BIND 0x00200000UL
251
252#define TCP_SOCKET__CONNECTTO 0x00400000UL 235#define TCP_SOCKET__CONNECTTO 0x00400000UL
253#define TCP_SOCKET__NEWCONN 0x00800000UL 236#define TCP_SOCKET__NEWCONN 0x00800000UL
254#define TCP_SOCKET__ACCEPTFROM 0x01000000UL 237#define TCP_SOCKET__ACCEPTFROM 0x01000000UL
255#define TCP_SOCKET__NODE_BIND 0x02000000UL 238#define TCP_SOCKET__NODE_BIND 0x02000000UL
256#define TCP_SOCKET__NAME_CONNECT 0x04000000UL 239#define TCP_SOCKET__NAME_CONNECT 0x04000000UL
257
258#define UDP_SOCKET__IOCTL 0x00000001UL 240#define UDP_SOCKET__IOCTL 0x00000001UL
259#define UDP_SOCKET__READ 0x00000002UL 241#define UDP_SOCKET__READ 0x00000002UL
260#define UDP_SOCKET__WRITE 0x00000004UL 242#define UDP_SOCKET__WRITE 0x00000004UL
@@ -277,9 +259,7 @@
277#define UDP_SOCKET__RECV_MSG 0x00080000UL 259#define UDP_SOCKET__RECV_MSG 0x00080000UL
278#define UDP_SOCKET__SEND_MSG 0x00100000UL 260#define UDP_SOCKET__SEND_MSG 0x00100000UL
279#define UDP_SOCKET__NAME_BIND 0x00200000UL 261#define UDP_SOCKET__NAME_BIND 0x00200000UL
280
281#define UDP_SOCKET__NODE_BIND 0x00400000UL 262#define UDP_SOCKET__NODE_BIND 0x00400000UL
282
283#define RAWIP_SOCKET__IOCTL 0x00000001UL 263#define RAWIP_SOCKET__IOCTL 0x00000001UL
284#define RAWIP_SOCKET__READ 0x00000002UL 264#define RAWIP_SOCKET__READ 0x00000002UL
285#define RAWIP_SOCKET__WRITE 0x00000004UL 265#define RAWIP_SOCKET__WRITE 0x00000004UL
@@ -302,9 +282,7 @@
302#define RAWIP_SOCKET__RECV_MSG 0x00080000UL 282#define RAWIP_SOCKET__RECV_MSG 0x00080000UL
303#define RAWIP_SOCKET__SEND_MSG 0x00100000UL 283#define RAWIP_SOCKET__SEND_MSG 0x00100000UL
304#define RAWIP_SOCKET__NAME_BIND 0x00200000UL 284#define RAWIP_SOCKET__NAME_BIND 0x00200000UL
305
306#define RAWIP_SOCKET__NODE_BIND 0x00400000UL 285#define RAWIP_SOCKET__NODE_BIND 0x00400000UL
307
308#define NODE__TCP_RECV 0x00000001UL 286#define NODE__TCP_RECV 0x00000001UL
309#define NODE__TCP_SEND 0x00000002UL 287#define NODE__TCP_SEND 0x00000002UL
310#define NODE__UDP_RECV 0x00000004UL 288#define NODE__UDP_RECV 0x00000004UL
@@ -314,7 +292,6 @@
314#define NODE__ENFORCE_DEST 0x00000040UL 292#define NODE__ENFORCE_DEST 0x00000040UL
315#define NODE__DCCP_RECV 0x00000080UL 293#define NODE__DCCP_RECV 0x00000080UL
316#define NODE__DCCP_SEND 0x00000100UL 294#define NODE__DCCP_SEND 0x00000100UL
317
318#define NETIF__TCP_RECV 0x00000001UL 295#define NETIF__TCP_RECV 0x00000001UL
319#define NETIF__TCP_SEND 0x00000002UL 296#define NETIF__TCP_SEND 0x00000002UL
320#define NETIF__UDP_RECV 0x00000004UL 297#define NETIF__UDP_RECV 0x00000004UL
@@ -323,7 +300,6 @@
323#define NETIF__RAWIP_SEND 0x00000020UL 300#define NETIF__RAWIP_SEND 0x00000020UL
324#define NETIF__DCCP_RECV 0x00000040UL 301#define NETIF__DCCP_RECV 0x00000040UL
325#define NETIF__DCCP_SEND 0x00000080UL 302#define NETIF__DCCP_SEND 0x00000080UL
326
327#define NETLINK_SOCKET__IOCTL 0x00000001UL 303#define NETLINK_SOCKET__IOCTL 0x00000001UL
328#define NETLINK_SOCKET__READ 0x00000002UL 304#define NETLINK_SOCKET__READ 0x00000002UL
329#define NETLINK_SOCKET__WRITE 0x00000004UL 305#define NETLINK_SOCKET__WRITE 0x00000004UL
@@ -346,7 +322,6 @@
346#define NETLINK_SOCKET__RECV_MSG 0x00080000UL 322#define NETLINK_SOCKET__RECV_MSG 0x00080000UL
347#define NETLINK_SOCKET__SEND_MSG 0x00100000UL 323#define NETLINK_SOCKET__SEND_MSG 0x00100000UL
348#define NETLINK_SOCKET__NAME_BIND 0x00200000UL 324#define NETLINK_SOCKET__NAME_BIND 0x00200000UL
349
350#define PACKET_SOCKET__IOCTL 0x00000001UL 325#define PACKET_SOCKET__IOCTL 0x00000001UL
351#define PACKET_SOCKET__READ 0x00000002UL 326#define PACKET_SOCKET__READ 0x00000002UL
352#define PACKET_SOCKET__WRITE 0x00000004UL 327#define PACKET_SOCKET__WRITE 0x00000004UL
@@ -369,7 +344,6 @@
369#define PACKET_SOCKET__RECV_MSG 0x00080000UL 344#define PACKET_SOCKET__RECV_MSG 0x00080000UL
370#define PACKET_SOCKET__SEND_MSG 0x00100000UL 345#define PACKET_SOCKET__SEND_MSG 0x00100000UL
371#define PACKET_SOCKET__NAME_BIND 0x00200000UL 346#define PACKET_SOCKET__NAME_BIND 0x00200000UL
372
373#define KEY_SOCKET__IOCTL 0x00000001UL 347#define KEY_SOCKET__IOCTL 0x00000001UL
374#define KEY_SOCKET__READ 0x00000002UL 348#define KEY_SOCKET__READ 0x00000002UL
375#define KEY_SOCKET__WRITE 0x00000004UL 349#define KEY_SOCKET__WRITE 0x00000004UL
@@ -392,7 +366,6 @@
392#define KEY_SOCKET__RECV_MSG 0x00080000UL 366#define KEY_SOCKET__RECV_MSG 0x00080000UL
393#define KEY_SOCKET__SEND_MSG 0x00100000UL 367#define KEY_SOCKET__SEND_MSG 0x00100000UL
394#define KEY_SOCKET__NAME_BIND 0x00200000UL 368#define KEY_SOCKET__NAME_BIND 0x00200000UL
395
396#define UNIX_STREAM_SOCKET__IOCTL 0x00000001UL 369#define UNIX_STREAM_SOCKET__IOCTL 0x00000001UL
397#define UNIX_STREAM_SOCKET__READ 0x00000002UL 370#define UNIX_STREAM_SOCKET__READ 0x00000002UL
398#define UNIX_STREAM_SOCKET__WRITE 0x00000004UL 371#define UNIX_STREAM_SOCKET__WRITE 0x00000004UL
@@ -415,11 +388,9 @@
415#define UNIX_STREAM_SOCKET__RECV_MSG 0x00080000UL 388#define UNIX_STREAM_SOCKET__RECV_MSG 0x00080000UL
416#define UNIX_STREAM_SOCKET__SEND_MSG 0x00100000UL 389#define UNIX_STREAM_SOCKET__SEND_MSG 0x00100000UL
417#define UNIX_STREAM_SOCKET__NAME_BIND 0x00200000UL 390#define UNIX_STREAM_SOCKET__NAME_BIND 0x00200000UL
418
419#define UNIX_STREAM_SOCKET__CONNECTTO 0x00400000UL 391#define UNIX_STREAM_SOCKET__CONNECTTO 0x00400000UL
420#define UNIX_STREAM_SOCKET__NEWCONN 0x00800000UL 392#define UNIX_STREAM_SOCKET__NEWCONN 0x00800000UL
421#define UNIX_STREAM_SOCKET__ACCEPTFROM 0x01000000UL 393#define UNIX_STREAM_SOCKET__ACCEPTFROM 0x01000000UL
422
423#define UNIX_DGRAM_SOCKET__IOCTL 0x00000001UL 394#define UNIX_DGRAM_SOCKET__IOCTL 0x00000001UL
424#define UNIX_DGRAM_SOCKET__READ 0x00000002UL 395#define UNIX_DGRAM_SOCKET__READ 0x00000002UL
425#define UNIX_DGRAM_SOCKET__WRITE 0x00000004UL 396#define UNIX_DGRAM_SOCKET__WRITE 0x00000004UL
@@ -442,7 +413,6 @@
442#define UNIX_DGRAM_SOCKET__RECV_MSG 0x00080000UL 413#define UNIX_DGRAM_SOCKET__RECV_MSG 0x00080000UL
443#define UNIX_DGRAM_SOCKET__SEND_MSG 0x00100000UL 414#define UNIX_DGRAM_SOCKET__SEND_MSG 0x00100000UL
444#define UNIX_DGRAM_SOCKET__NAME_BIND 0x00200000UL 415#define UNIX_DGRAM_SOCKET__NAME_BIND 0x00200000UL
445
446#define PROCESS__FORK 0x00000001UL 416#define PROCESS__FORK 0x00000001UL
447#define PROCESS__TRANSITION 0x00000002UL 417#define PROCESS__TRANSITION 0x00000002UL
448#define PROCESS__SIGCHLD 0x00000004UL 418#define PROCESS__SIGCHLD 0x00000004UL
@@ -473,7 +443,6 @@
473#define PROCESS__EXECHEAP 0x08000000UL 443#define PROCESS__EXECHEAP 0x08000000UL
474#define PROCESS__SETKEYCREATE 0x10000000UL 444#define PROCESS__SETKEYCREATE 0x10000000UL
475#define PROCESS__SETSOCKCREATE 0x20000000UL 445#define PROCESS__SETSOCKCREATE 0x20000000UL
476
477#define IPC__CREATE 0x00000001UL 446#define IPC__CREATE 0x00000001UL
478#define IPC__DESTROY 0x00000002UL 447#define IPC__DESTROY 0x00000002UL
479#define IPC__GETATTR 0x00000004UL 448#define IPC__GETATTR 0x00000004UL
@@ -483,7 +452,6 @@
483#define IPC__ASSOCIATE 0x00000040UL 452#define IPC__ASSOCIATE 0x00000040UL
484#define IPC__UNIX_READ 0x00000080UL 453#define IPC__UNIX_READ 0x00000080UL
485#define IPC__UNIX_WRITE 0x00000100UL 454#define IPC__UNIX_WRITE 0x00000100UL
486
487#define SEM__CREATE 0x00000001UL 455#define SEM__CREATE 0x00000001UL
488#define SEM__DESTROY 0x00000002UL 456#define SEM__DESTROY 0x00000002UL
489#define SEM__GETATTR 0x00000004UL 457#define SEM__GETATTR 0x00000004UL
@@ -493,7 +461,6 @@
493#define SEM__ASSOCIATE 0x00000040UL 461#define SEM__ASSOCIATE 0x00000040UL
494#define SEM__UNIX_READ 0x00000080UL 462#define SEM__UNIX_READ 0x00000080UL
495#define SEM__UNIX_WRITE 0x00000100UL 463#define SEM__UNIX_WRITE 0x00000100UL
496
497#define MSGQ__CREATE 0x00000001UL 464#define MSGQ__CREATE 0x00000001UL
498#define MSGQ__DESTROY 0x00000002UL 465#define MSGQ__DESTROY 0x00000002UL
499#define MSGQ__GETATTR 0x00000004UL 466#define MSGQ__GETATTR 0x00000004UL
@@ -503,12 +470,9 @@
503#define MSGQ__ASSOCIATE 0x00000040UL 470#define MSGQ__ASSOCIATE 0x00000040UL
504#define MSGQ__UNIX_READ 0x00000080UL 471#define MSGQ__UNIX_READ 0x00000080UL
505#define MSGQ__UNIX_WRITE 0x00000100UL 472#define MSGQ__UNIX_WRITE 0x00000100UL
506
507#define MSGQ__ENQUEUE 0x00000200UL 473#define MSGQ__ENQUEUE 0x00000200UL
508
509#define MSG__SEND 0x00000001UL 474#define MSG__SEND 0x00000001UL
510#define MSG__RECEIVE 0x00000002UL 475#define MSG__RECEIVE 0x00000002UL
511
512#define SHM__CREATE 0x00000001UL 476#define SHM__CREATE 0x00000001UL
513#define SHM__DESTROY 0x00000002UL 477#define SHM__DESTROY 0x00000002UL
514#define SHM__GETATTR 0x00000004UL 478#define SHM__GETATTR 0x00000004UL
@@ -518,9 +482,7 @@
518#define SHM__ASSOCIATE 0x00000040UL 482#define SHM__ASSOCIATE 0x00000040UL
519#define SHM__UNIX_READ 0x00000080UL 483#define SHM__UNIX_READ 0x00000080UL
520#define SHM__UNIX_WRITE 0x00000100UL 484#define SHM__UNIX_WRITE 0x00000100UL
521
522#define SHM__LOCK 0x00000200UL 485#define SHM__LOCK 0x00000200UL
523
524#define SECURITY__COMPUTE_AV 0x00000001UL 486#define SECURITY__COMPUTE_AV 0x00000001UL
525#define SECURITY__COMPUTE_CREATE 0x00000002UL 487#define SECURITY__COMPUTE_CREATE 0x00000002UL
526#define SECURITY__COMPUTE_MEMBER 0x00000004UL 488#define SECURITY__COMPUTE_MEMBER 0x00000004UL
@@ -532,12 +494,10 @@
532#define SECURITY__SETBOOL 0x00000100UL 494#define SECURITY__SETBOOL 0x00000100UL
533#define SECURITY__SETSECPARAM 0x00000200UL 495#define SECURITY__SETSECPARAM 0x00000200UL
534#define SECURITY__SETCHECKREQPROT 0x00000400UL 496#define SECURITY__SETCHECKREQPROT 0x00000400UL
535
536#define SYSTEM__IPC_INFO 0x00000001UL 497#define SYSTEM__IPC_INFO 0x00000001UL
537#define SYSTEM__SYSLOG_READ 0x00000002UL 498#define SYSTEM__SYSLOG_READ 0x00000002UL
538#define SYSTEM__SYSLOG_MOD 0x00000004UL 499#define SYSTEM__SYSLOG_MOD 0x00000004UL
539#define SYSTEM__SYSLOG_CONSOLE 0x00000008UL 500#define SYSTEM__SYSLOG_CONSOLE 0x00000008UL
540
541#define CAPABILITY__CHOWN 0x00000001UL 501#define CAPABILITY__CHOWN 0x00000001UL
542#define CAPABILITY__DAC_OVERRIDE 0x00000002UL 502#define CAPABILITY__DAC_OVERRIDE 0x00000002UL
543#define CAPABILITY__DAC_READ_SEARCH 0x00000004UL 503#define CAPABILITY__DAC_READ_SEARCH 0x00000004UL
@@ -569,110 +529,6 @@
569#define CAPABILITY__LEASE 0x10000000UL 529#define CAPABILITY__LEASE 0x10000000UL
570#define CAPABILITY__AUDIT_WRITE 0x20000000UL 530#define CAPABILITY__AUDIT_WRITE 0x20000000UL
571#define CAPABILITY__AUDIT_CONTROL 0x40000000UL 531#define CAPABILITY__AUDIT_CONTROL 0x40000000UL
572
573#define PASSWD__PASSWD 0x00000001UL
574#define PASSWD__CHFN 0x00000002UL
575#define PASSWD__CHSH 0x00000004UL
576#define PASSWD__ROOTOK 0x00000008UL
577#define PASSWD__CRONTAB 0x00000010UL
578
579#define DRAWABLE__CREATE 0x00000001UL
580#define DRAWABLE__DESTROY 0x00000002UL
581#define DRAWABLE__DRAW 0x00000004UL
582#define DRAWABLE__COPY 0x00000008UL
583#define DRAWABLE__GETATTR 0x00000010UL
584
585#define GC__CREATE 0x00000001UL
586#define GC__FREE 0x00000002UL
587#define GC__GETATTR 0x00000004UL
588#define GC__SETATTR 0x00000008UL
589
590#define WINDOW__ADDCHILD 0x00000001UL
591#define WINDOW__CREATE 0x00000002UL
592#define WINDOW__DESTROY 0x00000004UL
593#define WINDOW__MAP 0x00000008UL
594#define WINDOW__UNMAP 0x00000010UL
595#define WINDOW__CHSTACK 0x00000020UL
596#define WINDOW__CHPROPLIST 0x00000040UL
597#define WINDOW__CHPROP 0x00000080UL
598#define WINDOW__LISTPROP 0x00000100UL
599#define WINDOW__GETATTR 0x00000200UL
600#define WINDOW__SETATTR 0x00000400UL
601#define WINDOW__SETFOCUS 0x00000800UL
602#define WINDOW__MOVE 0x00001000UL
603#define WINDOW__CHSELECTION 0x00002000UL
604#define WINDOW__CHPARENT 0x00004000UL
605#define WINDOW__CTRLLIFE 0x00008000UL
606#define WINDOW__ENUMERATE 0x00010000UL
607#define WINDOW__TRANSPARENT 0x00020000UL
608#define WINDOW__MOUSEMOTION 0x00040000UL
609#define WINDOW__CLIENTCOMEVENT 0x00080000UL
610#define WINDOW__INPUTEVENT 0x00100000UL
611#define WINDOW__DRAWEVENT 0x00200000UL
612#define WINDOW__WINDOWCHANGEEVENT 0x00400000UL
613#define WINDOW__WINDOWCHANGEREQUEST 0x00800000UL
614#define WINDOW__SERVERCHANGEEVENT 0x01000000UL
615#define WINDOW__EXTENSIONEVENT 0x02000000UL
616
617#define FONT__LOAD 0x00000001UL
618#define FONT__FREE 0x00000002UL
619#define FONT__GETATTR 0x00000004UL
620#define FONT__USE 0x00000008UL
621
622#define COLORMAP__CREATE 0x00000001UL
623#define COLORMAP__FREE 0x00000002UL
624#define COLORMAP__INSTALL 0x00000004UL
625#define COLORMAP__UNINSTALL 0x00000008UL
626#define COLORMAP__LIST 0x00000010UL
627#define COLORMAP__READ 0x00000020UL
628#define COLORMAP__STORE 0x00000040UL
629#define COLORMAP__GETATTR 0x00000080UL
630#define COLORMAP__SETATTR 0x00000100UL
631
632#define PROPERTY__CREATE 0x00000001UL
633#define PROPERTY__FREE 0x00000002UL
634#define PROPERTY__READ 0x00000004UL
635#define PROPERTY__WRITE 0x00000008UL
636
637#define CURSOR__CREATE 0x00000001UL
638#define CURSOR__CREATEGLYPH 0x00000002UL
639#define CURSOR__FREE 0x00000004UL
640#define CURSOR__ASSIGN 0x00000008UL
641#define CURSOR__SETATTR 0x00000010UL
642
643#define XCLIENT__KILL 0x00000001UL
644
645#define XINPUT__LOOKUP 0x00000001UL
646#define XINPUT__GETATTR 0x00000002UL
647#define XINPUT__SETATTR 0x00000004UL
648#define XINPUT__SETFOCUS 0x00000008UL
649#define XINPUT__WARPPOINTER 0x00000010UL
650#define XINPUT__ACTIVEGRAB 0x00000020UL
651#define XINPUT__PASSIVEGRAB 0x00000040UL
652#define XINPUT__UNGRAB 0x00000080UL
653#define XINPUT__BELL 0x00000100UL
654#define XINPUT__MOUSEMOTION 0x00000200UL
655#define XINPUT__RELABELINPUT 0x00000400UL
656
657#define XSERVER__SCREENSAVER 0x00000001UL
658#define XSERVER__GETHOSTLIST 0x00000002UL
659#define XSERVER__SETHOSTLIST 0x00000004UL
660#define XSERVER__GETFONTPATH 0x00000008UL
661#define XSERVER__SETFONTPATH 0x00000010UL
662#define XSERVER__GETATTR 0x00000020UL
663#define XSERVER__GRAB 0x00000040UL
664#define XSERVER__UNGRAB 0x00000080UL
665
666#define XEXTENSION__QUERY 0x00000001UL
667#define XEXTENSION__USE 0x00000002UL
668
669#define PAX__PAGEEXEC 0x00000001UL
670#define PAX__EMUTRAMP 0x00000002UL
671#define PAX__MPROTECT 0x00000004UL
672#define PAX__RANDMMAP 0x00000008UL
673#define PAX__RANDEXEC 0x00000010UL
674#define PAX__SEGMEXEC 0x00000020UL
675
676#define NETLINK_ROUTE_SOCKET__IOCTL 0x00000001UL 532#define NETLINK_ROUTE_SOCKET__IOCTL 0x00000001UL
677#define NETLINK_ROUTE_SOCKET__READ 0x00000002UL 533#define NETLINK_ROUTE_SOCKET__READ 0x00000002UL
678#define NETLINK_ROUTE_SOCKET__WRITE 0x00000004UL 534#define NETLINK_ROUTE_SOCKET__WRITE 0x00000004UL
@@ -695,10 +551,8 @@
695#define NETLINK_ROUTE_SOCKET__RECV_MSG 0x00080000UL 551#define NETLINK_ROUTE_SOCKET__RECV_MSG 0x00080000UL
696#define NETLINK_ROUTE_SOCKET__SEND_MSG 0x00100000UL 552#define NETLINK_ROUTE_SOCKET__SEND_MSG 0x00100000UL
697#define NETLINK_ROUTE_SOCKET__NAME_BIND 0x00200000UL 553#define NETLINK_ROUTE_SOCKET__NAME_BIND 0x00200000UL
698
699#define NETLINK_ROUTE_SOCKET__NLMSG_READ 0x00400000UL 554#define NETLINK_ROUTE_SOCKET__NLMSG_READ 0x00400000UL
700#define NETLINK_ROUTE_SOCKET__NLMSG_WRITE 0x00800000UL 555#define NETLINK_ROUTE_SOCKET__NLMSG_WRITE 0x00800000UL
701
702#define NETLINK_FIREWALL_SOCKET__IOCTL 0x00000001UL 556#define NETLINK_FIREWALL_SOCKET__IOCTL 0x00000001UL
703#define NETLINK_FIREWALL_SOCKET__READ 0x00000002UL 557#define NETLINK_FIREWALL_SOCKET__READ 0x00000002UL
704#define NETLINK_FIREWALL_SOCKET__WRITE 0x00000004UL 558#define NETLINK_FIREWALL_SOCKET__WRITE 0x00000004UL
@@ -721,10 +575,8 @@
721#define NETLINK_FIREWALL_SOCKET__RECV_MSG 0x00080000UL 575#define NETLINK_FIREWALL_SOCKET__RECV_MSG 0x00080000UL
722#define NETLINK_FIREWALL_SOCKET__SEND_MSG 0x00100000UL 576#define NETLINK_FIREWALL_SOCKET__SEND_MSG 0x00100000UL
723#define NETLINK_FIREWALL_SOCKET__NAME_BIND 0x00200000UL 577#define NETLINK_FIREWALL_SOCKET__NAME_BIND 0x00200000UL
724
725#define NETLINK_FIREWALL_SOCKET__NLMSG_READ 0x00400000UL 578#define NETLINK_FIREWALL_SOCKET__NLMSG_READ 0x00400000UL
726#define NETLINK_FIREWALL_SOCKET__NLMSG_WRITE 0x00800000UL 579#define NETLINK_FIREWALL_SOCKET__NLMSG_WRITE 0x00800000UL
727
728#define NETLINK_TCPDIAG_SOCKET__IOCTL 0x00000001UL 580#define NETLINK_TCPDIAG_SOCKET__IOCTL 0x00000001UL
729#define NETLINK_TCPDIAG_SOCKET__READ 0x00000002UL 581#define NETLINK_TCPDIAG_SOCKET__READ 0x00000002UL
730#define NETLINK_TCPDIAG_SOCKET__WRITE 0x00000004UL 582#define NETLINK_TCPDIAG_SOCKET__WRITE 0x00000004UL
@@ -747,10 +599,8 @@
747#define NETLINK_TCPDIAG_SOCKET__RECV_MSG 0x00080000UL 599#define NETLINK_TCPDIAG_SOCKET__RECV_MSG 0x00080000UL
748#define NETLINK_TCPDIAG_SOCKET__SEND_MSG 0x00100000UL 600#define NETLINK_TCPDIAG_SOCKET__SEND_MSG 0x00100000UL
749#define NETLINK_TCPDIAG_SOCKET__NAME_BIND 0x00200000UL 601#define NETLINK_TCPDIAG_SOCKET__NAME_BIND 0x00200000UL
750
751#define NETLINK_TCPDIAG_SOCKET__NLMSG_READ 0x00400000UL 602#define NETLINK_TCPDIAG_SOCKET__NLMSG_READ 0x00400000UL
752#define NETLINK_TCPDIAG_SOCKET__NLMSG_WRITE 0x00800000UL 603#define NETLINK_TCPDIAG_SOCKET__NLMSG_WRITE 0x00800000UL
753
754#define NETLINK_NFLOG_SOCKET__IOCTL 0x00000001UL 604#define NETLINK_NFLOG_SOCKET__IOCTL 0x00000001UL
755#define NETLINK_NFLOG_SOCKET__READ 0x00000002UL 605#define NETLINK_NFLOG_SOCKET__READ 0x00000002UL
756#define NETLINK_NFLOG_SOCKET__WRITE 0x00000004UL 606#define NETLINK_NFLOG_SOCKET__WRITE 0x00000004UL
@@ -773,7 +623,6 @@
773#define NETLINK_NFLOG_SOCKET__RECV_MSG 0x00080000UL 623#define NETLINK_NFLOG_SOCKET__RECV_MSG 0x00080000UL
774#define NETLINK_NFLOG_SOCKET__SEND_MSG 0x00100000UL 624#define NETLINK_NFLOG_SOCKET__SEND_MSG 0x00100000UL
775#define NETLINK_NFLOG_SOCKET__NAME_BIND 0x00200000UL 625#define NETLINK_NFLOG_SOCKET__NAME_BIND 0x00200000UL
776
777#define NETLINK_XFRM_SOCKET__IOCTL 0x00000001UL 626#define NETLINK_XFRM_SOCKET__IOCTL 0x00000001UL
778#define NETLINK_XFRM_SOCKET__READ 0x00000002UL 627#define NETLINK_XFRM_SOCKET__READ 0x00000002UL
779#define NETLINK_XFRM_SOCKET__WRITE 0x00000004UL 628#define NETLINK_XFRM_SOCKET__WRITE 0x00000004UL
@@ -796,10 +645,8 @@
796#define NETLINK_XFRM_SOCKET__RECV_MSG 0x00080000UL 645#define NETLINK_XFRM_SOCKET__RECV_MSG 0x00080000UL
797#define NETLINK_XFRM_SOCKET__SEND_MSG 0x00100000UL 646#define NETLINK_XFRM_SOCKET__SEND_MSG 0x00100000UL
798#define NETLINK_XFRM_SOCKET__NAME_BIND 0x00200000UL 647#define NETLINK_XFRM_SOCKET__NAME_BIND 0x00200000UL
799
800#define NETLINK_XFRM_SOCKET__NLMSG_READ 0x00400000UL 648#define NETLINK_XFRM_SOCKET__NLMSG_READ 0x00400000UL
801#define NETLINK_XFRM_SOCKET__NLMSG_WRITE 0x00800000UL 649#define NETLINK_XFRM_SOCKET__NLMSG_WRITE 0x00800000UL
802
803#define NETLINK_SELINUX_SOCKET__IOCTL 0x00000001UL 650#define NETLINK_SELINUX_SOCKET__IOCTL 0x00000001UL
804#define NETLINK_SELINUX_SOCKET__READ 0x00000002UL 651#define NETLINK_SELINUX_SOCKET__READ 0x00000002UL
805#define NETLINK_SELINUX_SOCKET__WRITE 0x00000004UL 652#define NETLINK_SELINUX_SOCKET__WRITE 0x00000004UL
@@ -822,7 +669,6 @@
822#define NETLINK_SELINUX_SOCKET__RECV_MSG 0x00080000UL 669#define NETLINK_SELINUX_SOCKET__RECV_MSG 0x00080000UL
823#define NETLINK_SELINUX_SOCKET__SEND_MSG 0x00100000UL 670#define NETLINK_SELINUX_SOCKET__SEND_MSG 0x00100000UL
824#define NETLINK_SELINUX_SOCKET__NAME_BIND 0x00200000UL 671#define NETLINK_SELINUX_SOCKET__NAME_BIND 0x00200000UL
825
826#define NETLINK_AUDIT_SOCKET__IOCTL 0x00000001UL 672#define NETLINK_AUDIT_SOCKET__IOCTL 0x00000001UL
827#define NETLINK_AUDIT_SOCKET__READ 0x00000002UL 673#define NETLINK_AUDIT_SOCKET__READ 0x00000002UL
828#define NETLINK_AUDIT_SOCKET__WRITE 0x00000004UL 674#define NETLINK_AUDIT_SOCKET__WRITE 0x00000004UL
@@ -845,12 +691,10 @@
845#define NETLINK_AUDIT_SOCKET__RECV_MSG 0x00080000UL 691#define NETLINK_AUDIT_SOCKET__RECV_MSG 0x00080000UL
846#define NETLINK_AUDIT_SOCKET__SEND_MSG 0x00100000UL 692#define NETLINK_AUDIT_SOCKET__SEND_MSG 0x00100000UL
847#define NETLINK_AUDIT_SOCKET__NAME_BIND 0x00200000UL 693#define NETLINK_AUDIT_SOCKET__NAME_BIND 0x00200000UL
848
849#define NETLINK_AUDIT_SOCKET__NLMSG_READ 0x00400000UL 694#define NETLINK_AUDIT_SOCKET__NLMSG_READ 0x00400000UL
850#define NETLINK_AUDIT_SOCKET__NLMSG_WRITE 0x00800000UL 695#define NETLINK_AUDIT_SOCKET__NLMSG_WRITE 0x00800000UL
851#define NETLINK_AUDIT_SOCKET__NLMSG_RELAY 0x01000000UL 696#define NETLINK_AUDIT_SOCKET__NLMSG_RELAY 0x01000000UL
852#define NETLINK_AUDIT_SOCKET__NLMSG_READPRIV 0x02000000UL 697#define NETLINK_AUDIT_SOCKET__NLMSG_READPRIV 0x02000000UL
853
854#define NETLINK_IP6FW_SOCKET__IOCTL 0x00000001UL 698#define NETLINK_IP6FW_SOCKET__IOCTL 0x00000001UL
855#define NETLINK_IP6FW_SOCKET__READ 0x00000002UL 699#define NETLINK_IP6FW_SOCKET__READ 0x00000002UL
856#define NETLINK_IP6FW_SOCKET__WRITE 0x00000004UL 700#define NETLINK_IP6FW_SOCKET__WRITE 0x00000004UL
@@ -873,10 +717,8 @@
873#define NETLINK_IP6FW_SOCKET__RECV_MSG 0x00080000UL 717#define NETLINK_IP6FW_SOCKET__RECV_MSG 0x00080000UL
874#define NETLINK_IP6FW_SOCKET__SEND_MSG 0x00100000UL 718#define NETLINK_IP6FW_SOCKET__SEND_MSG 0x00100000UL
875#define NETLINK_IP6FW_SOCKET__NAME_BIND 0x00200000UL 719#define NETLINK_IP6FW_SOCKET__NAME_BIND 0x00200000UL
876
877#define NETLINK_IP6FW_SOCKET__NLMSG_READ 0x00400000UL 720#define NETLINK_IP6FW_SOCKET__NLMSG_READ 0x00400000UL
878#define NETLINK_IP6FW_SOCKET__NLMSG_WRITE 0x00800000UL 721#define NETLINK_IP6FW_SOCKET__NLMSG_WRITE 0x00800000UL
879
880#define NETLINK_DNRT_SOCKET__IOCTL 0x00000001UL 722#define NETLINK_DNRT_SOCKET__IOCTL 0x00000001UL
881#define NETLINK_DNRT_SOCKET__READ 0x00000002UL 723#define NETLINK_DNRT_SOCKET__READ 0x00000002UL
882#define NETLINK_DNRT_SOCKET__WRITE 0x00000004UL 724#define NETLINK_DNRT_SOCKET__WRITE 0x00000004UL
@@ -899,24 +741,10 @@
899#define NETLINK_DNRT_SOCKET__RECV_MSG 0x00080000UL 741#define NETLINK_DNRT_SOCKET__RECV_MSG 0x00080000UL
900#define NETLINK_DNRT_SOCKET__SEND_MSG 0x00100000UL 742#define NETLINK_DNRT_SOCKET__SEND_MSG 0x00100000UL
901#define NETLINK_DNRT_SOCKET__NAME_BIND 0x00200000UL 743#define NETLINK_DNRT_SOCKET__NAME_BIND 0x00200000UL
902
903#define DBUS__ACQUIRE_SVC 0x00000001UL
904#define DBUS__SEND_MSG 0x00000002UL
905
906#define NSCD__GETPWD 0x00000001UL
907#define NSCD__GETGRP 0x00000002UL
908#define NSCD__GETHOST 0x00000004UL
909#define NSCD__GETSTAT 0x00000008UL
910#define NSCD__ADMIN 0x00000010UL
911#define NSCD__SHMEMPWD 0x00000020UL
912#define NSCD__SHMEMGRP 0x00000040UL
913#define NSCD__SHMEMHOST 0x00000080UL
914
915#define ASSOCIATION__SENDTO 0x00000001UL 744#define ASSOCIATION__SENDTO 0x00000001UL
916#define ASSOCIATION__RECVFROM 0x00000002UL 745#define ASSOCIATION__RECVFROM 0x00000002UL
917#define ASSOCIATION__SETCONTEXT 0x00000004UL 746#define ASSOCIATION__SETCONTEXT 0x00000004UL
918#define ASSOCIATION__POLMATCH 0x00000008UL 747#define ASSOCIATION__POLMATCH 0x00000008UL
919
920#define NETLINK_KOBJECT_UEVENT_SOCKET__IOCTL 0x00000001UL 748#define NETLINK_KOBJECT_UEVENT_SOCKET__IOCTL 0x00000001UL
921#define NETLINK_KOBJECT_UEVENT_SOCKET__READ 0x00000002UL 749#define NETLINK_KOBJECT_UEVENT_SOCKET__READ 0x00000002UL
922#define NETLINK_KOBJECT_UEVENT_SOCKET__WRITE 0x00000004UL 750#define NETLINK_KOBJECT_UEVENT_SOCKET__WRITE 0x00000004UL
@@ -939,7 +767,6 @@
939#define NETLINK_KOBJECT_UEVENT_SOCKET__RECV_MSG 0x00080000UL 767#define NETLINK_KOBJECT_UEVENT_SOCKET__RECV_MSG 0x00080000UL
940#define NETLINK_KOBJECT_UEVENT_SOCKET__SEND_MSG 0x00100000UL 768#define NETLINK_KOBJECT_UEVENT_SOCKET__SEND_MSG 0x00100000UL
941#define NETLINK_KOBJECT_UEVENT_SOCKET__NAME_BIND 0x00200000UL 769#define NETLINK_KOBJECT_UEVENT_SOCKET__NAME_BIND 0x00200000UL
942
943#define APPLETALK_SOCKET__IOCTL 0x00000001UL 770#define APPLETALK_SOCKET__IOCTL 0x00000001UL
944#define APPLETALK_SOCKET__READ 0x00000002UL 771#define APPLETALK_SOCKET__READ 0x00000002UL
945#define APPLETALK_SOCKET__WRITE 0x00000004UL 772#define APPLETALK_SOCKET__WRITE 0x00000004UL
@@ -962,11 +789,9 @@
962#define APPLETALK_SOCKET__RECV_MSG 0x00080000UL 789#define APPLETALK_SOCKET__RECV_MSG 0x00080000UL
963#define APPLETALK_SOCKET__SEND_MSG 0x00100000UL 790#define APPLETALK_SOCKET__SEND_MSG 0x00100000UL
964#define APPLETALK_SOCKET__NAME_BIND 0x00200000UL 791#define APPLETALK_SOCKET__NAME_BIND 0x00200000UL
965
966#define PACKET__SEND 0x00000001UL 792#define PACKET__SEND 0x00000001UL
967#define PACKET__RECV 0x00000002UL 793#define PACKET__RECV 0x00000002UL
968#define PACKET__RELABELTO 0x00000004UL 794#define PACKET__RELABELTO 0x00000004UL
969
970#define KEY__VIEW 0x00000001UL 795#define KEY__VIEW 0x00000001UL
971#define KEY__READ 0x00000002UL 796#define KEY__READ 0x00000002UL
972#define KEY__WRITE 0x00000004UL 797#define KEY__WRITE 0x00000004UL
@@ -974,10 +799,6 @@
974#define KEY__LINK 0x00000010UL 799#define KEY__LINK 0x00000010UL
975#define KEY__SETATTR 0x00000020UL 800#define KEY__SETATTR 0x00000020UL
976#define KEY__CREATE 0x00000040UL 801#define KEY__CREATE 0x00000040UL
977
978#define CONTEXT__TRANSLATE 0x00000001UL
979#define CONTEXT__CONTAINS 0x00000002UL
980
981#define DCCP_SOCKET__IOCTL 0x00000001UL 802#define DCCP_SOCKET__IOCTL 0x00000001UL
982#define DCCP_SOCKET__READ 0x00000002UL 803#define DCCP_SOCKET__READ 0x00000002UL
983#define DCCP_SOCKET__WRITE 0x00000004UL 804#define DCCP_SOCKET__WRITE 0x00000004UL
diff --git a/security/selinux/include/class_to_string.h b/security/selinux/include/class_to_string.h
index 9f3ebb1bfae6..378799068441 100644
--- a/security/selinux/include/class_to_string.h
+++ b/security/selinux/include/class_to_string.h
@@ -2,7 +2,7 @@
2/* 2/*
3 * Security object class definitions 3 * Security object class definitions
4 */ 4 */
5 S_("null") 5 S_(NULL)
6 S_("security") 6 S_("security")
7 S_("process") 7 S_("process")
8 S_("system") 8 S_("system")
@@ -32,19 +32,19 @@
32 S_("msgq") 32 S_("msgq")
33 S_("shm") 33 S_("shm")
34 S_("ipc") 34 S_("ipc")
35 S_("passwd") 35 S_(NULL)
36 S_("drawable") 36 S_(NULL)
37 S_("window") 37 S_(NULL)
38 S_("gc") 38 S_(NULL)
39 S_("font") 39 S_(NULL)
40 S_("colormap") 40 S_(NULL)
41 S_("property") 41 S_(NULL)
42 S_("cursor") 42 S_(NULL)
43 S_("xclient") 43 S_(NULL)
44 S_("xinput") 44 S_(NULL)
45 S_("xserver") 45 S_(NULL)
46 S_("xextension") 46 S_(NULL)
47 S_("pax") 47 S_(NULL)
48 S_("netlink_route_socket") 48 S_("netlink_route_socket")
49 S_("netlink_firewall_socket") 49 S_("netlink_firewall_socket")
50 S_("netlink_tcpdiag_socket") 50 S_("netlink_tcpdiag_socket")
@@ -54,12 +54,12 @@
54 S_("netlink_audit_socket") 54 S_("netlink_audit_socket")
55 S_("netlink_ip6fw_socket") 55 S_("netlink_ip6fw_socket")
56 S_("netlink_dnrt_socket") 56 S_("netlink_dnrt_socket")
57 S_("dbus") 57 S_(NULL)
58 S_("nscd") 58 S_(NULL)
59 S_("association") 59 S_("association")
60 S_("netlink_kobject_uevent_socket") 60 S_("netlink_kobject_uevent_socket")
61 S_("appletalk_socket") 61 S_("appletalk_socket")
62 S_("packet") 62 S_("packet")
63 S_("key") 63 S_("key")
64 S_("context") 64 S_(NULL)
65 S_("dccp_socket") 65 S_("dccp_socket")
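
The S_(NULL) substitutions above are not deletions in disguise: this header is
pulled into an array initializer through an S_() macro, so every line must
stay in place to keep array indices aligned with the SECCLASS_* values in
flask.h (note how the numbering there keeps its gaps, e.g. SECCLASS_IPC 29
followed by SECCLASS_NETLINK_ROUTE_SOCKET 43), and the new BUG_ON() in
avc_dump_query() then traps any lookup that lands on a hole. A sketch of the
include-as-data pattern; the exact macro plumbing in avc.c is an assumption:

	#include <stddef.h>

	#define S_(s) s,
	static const char *class_to_string[] = {
		S_(NULL)		/* index 0: no class */
		S_("security")		/* index 1: SECCLASS_SECURITY */
		/* ... in the kernel, the entries come from
		 * #include "class_to_string.h" ... */
	};
	#undef S_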
diff --git a/security/selinux/include/flask.h b/security/selinux/include/flask.h
index 67cef371ee00..35f309f47873 100644
--- a/security/selinux/include/flask.h
+++ b/security/selinux/include/flask.h
@@ -34,19 +34,6 @@
34#define SECCLASS_MSGQ 27 34#define SECCLASS_MSGQ 27
35#define SECCLASS_SHM 28 35#define SECCLASS_SHM 28
36#define SECCLASS_IPC 29 36#define SECCLASS_IPC 29
37#define SECCLASS_PASSWD 30
38#define SECCLASS_DRAWABLE 31
39#define SECCLASS_WINDOW 32
40#define SECCLASS_GC 33
41#define SECCLASS_FONT 34
42#define SECCLASS_COLORMAP 35
43#define SECCLASS_PROPERTY 36
44#define SECCLASS_CURSOR 37
45#define SECCLASS_XCLIENT 38
46#define SECCLASS_XINPUT 39
47#define SECCLASS_XSERVER 40
48#define SECCLASS_XEXTENSION 41
49#define SECCLASS_PAX 42
50#define SECCLASS_NETLINK_ROUTE_SOCKET 43 37#define SECCLASS_NETLINK_ROUTE_SOCKET 43
51#define SECCLASS_NETLINK_FIREWALL_SOCKET 44 38#define SECCLASS_NETLINK_FIREWALL_SOCKET 44
52#define SECCLASS_NETLINK_TCPDIAG_SOCKET 45 39#define SECCLASS_NETLINK_TCPDIAG_SOCKET 45
@@ -56,14 +43,11 @@
56#define SECCLASS_NETLINK_AUDIT_SOCKET 49 43#define SECCLASS_NETLINK_AUDIT_SOCKET 49
57#define SECCLASS_NETLINK_IP6FW_SOCKET 50 44#define SECCLASS_NETLINK_IP6FW_SOCKET 50
58#define SECCLASS_NETLINK_DNRT_SOCKET 51 45#define SECCLASS_NETLINK_DNRT_SOCKET 51
59#define SECCLASS_DBUS 52
60#define SECCLASS_NSCD 53
61#define SECCLASS_ASSOCIATION 54 46#define SECCLASS_ASSOCIATION 54
62#define SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET 55 47#define SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET 55
63#define SECCLASS_APPLETALK_SOCKET 56 48#define SECCLASS_APPLETALK_SOCKET 56
64#define SECCLASS_PACKET 57 49#define SECCLASS_PACKET 57
65#define SECCLASS_KEY 58 50#define SECCLASS_KEY 58
66#define SECCLASS_CONTEXT 59
67#define SECCLASS_DCCP_SOCKET 60 51#define SECCLASS_DCCP_SOCKET 60
68 52
69/* 53/*
diff --git a/security/selinux/include/selinux_netlabel.h b/security/selinux/include/netlabel.h
index 2a732c9033e3..218e3f77c350 100644
--- a/security/selinux/include/selinux_netlabel.h
+++ b/security/selinux/include/netlabel.h
@@ -38,19 +38,22 @@
38 38
39#ifdef CONFIG_NETLABEL 39#ifdef CONFIG_NETLABEL
40void selinux_netlbl_cache_invalidate(void); 40void selinux_netlbl_cache_invalidate(void);
41int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid); 41
42int selinux_netlbl_socket_post_create(struct socket *sock);
43void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock);
44int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
45 struct sk_buff *skb,
46 struct avc_audit_data *ad);
47void selinux_netlbl_sk_security_reset(struct sk_security_struct *ssec, 42void selinux_netlbl_sk_security_reset(struct sk_security_struct *ssec,
48 int family); 43 int family);
49void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec, 44void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec,
50 int family); 45 int family);
51void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, 46void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec,
52 struct sk_security_struct *newssec); 47 struct sk_security_struct *newssec);
48
49int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid);
50
51void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock);
52int selinux_netlbl_socket_post_create(struct socket *sock);
53int selinux_netlbl_inode_permission(struct inode *inode, int mask); 53int selinux_netlbl_inode_permission(struct inode *inode, int mask);
54int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
55 struct sk_buff *skb,
56 struct avc_audit_data *ad);
54int selinux_netlbl_socket_setsockopt(struct socket *sock, 57int selinux_netlbl_socket_setsockopt(struct socket *sock,
55 int level, 58 int level,
56 int optname); 59 int optname);
@@ -60,59 +63,53 @@ static inline void selinux_netlbl_cache_invalidate(void)
60 return; 63 return;
61} 64}
62 65
63static inline int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, 66static inline void selinux_netlbl_sk_security_reset(
64 u32 base_sid, 67 struct sk_security_struct *ssec,
65 u32 *sid) 68 int family)
66{ 69{
67 *sid = SECSID_NULL; 70 return;
68 return 0;
69} 71}
70 72static inline void selinux_netlbl_sk_security_init(
71static inline int selinux_netlbl_socket_post_create(struct socket *sock) 73 struct sk_security_struct *ssec,
74 int family)
72{ 75{
73 return 0; 76 return;
74} 77}
75 78static inline void selinux_netlbl_sk_security_clone(
76static inline void selinux_netlbl_sock_graft(struct sock *sk, 79 struct sk_security_struct *ssec,
77 struct socket *sock) 80 struct sk_security_struct *newssec)
78{ 81{
79 return; 82 return;
80} 83}
81 84
82static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, 85static inline int selinux_netlbl_skbuff_getsid(struct sk_buff *skb,
83 struct sk_buff *skb, 86 u32 base_sid,
84 struct avc_audit_data *ad) 87 u32 *sid)
85{ 88{
89 *sid = SECSID_NULL;
86 return 0; 90 return 0;
87} 91}
88 92
89static inline void selinux_netlbl_sk_security_reset( 93static inline void selinux_netlbl_sock_graft(struct sock *sk,
90 struct sk_security_struct *ssec, 94 struct socket *sock)
91 int family)
92{
93 return;
94}
95
96static inline void selinux_netlbl_sk_security_init(
97 struct sk_security_struct *ssec,
98 int family)
99{ 95{
100 return; 96 return;
101} 97}
102 98static inline int selinux_netlbl_socket_post_create(struct socket *sock)
103static inline void selinux_netlbl_sk_security_clone(
104 struct sk_security_struct *ssec,
105 struct sk_security_struct *newssec)
106{ 99{
107 return; 100 return 0;
108} 101}
109
110static inline int selinux_netlbl_inode_permission(struct inode *inode, 102static inline int selinux_netlbl_inode_permission(struct inode *inode,
111 int mask) 103 int mask)
112{ 104{
113 return 0; 105 return 0;
114} 106}
115 107static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
108 struct sk_buff *skb,
109 struct avc_audit_data *ad)
110{
111 return 0;
112}
116static inline int selinux_netlbl_socket_setsockopt(struct socket *sock, 113static inline int selinux_netlbl_socket_setsockopt(struct socket *sock,
117 int level, 114 int level,
118 int optname) 115 int optname)
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index 210eec77e7ff..b94378afea25 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -34,7 +34,7 @@
34#define POLICYDB_VERSION_MAX POLICYDB_VERSION_RANGETRANS 34#define POLICYDB_VERSION_MAX POLICYDB_VERSION_RANGETRANS
35#endif 35#endif
36 36
37struct sk_buff; 37struct netlbl_lsm_secattr;
38 38
39extern int selinux_enabled; 39extern int selinux_enabled;
40extern int selinux_mls_enabled; 40extern int selinux_mls_enabled;
@@ -82,8 +82,6 @@ int security_netif_sid(char *name, u32 *if_sid,
82int security_node_sid(u16 domain, void *addr, u32 addrlen, 82int security_node_sid(u16 domain, void *addr, u32 addrlen,
83 u32 *out_sid); 83 u32 *out_sid);
84 84
85void security_skb_extlbl_sid(struct sk_buff *skb, u32 base_sid, u32 *sid);
86
87int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid, 85int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid,
88 u16 tclass); 86 u16 tclass);
89 87
@@ -102,5 +100,30 @@ int security_fs_use(const char *fstype, unsigned int *behavior,
102int security_genfs_sid(const char *fstype, char *name, u16 sclass, 100int security_genfs_sid(const char *fstype, char *name, u16 sclass,
103 u32 *sid); 101 u32 *sid);
104 102
103#ifdef CONFIG_NETLABEL
104int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
105 u32 base_sid,
106 u32 *sid);
107
108int security_netlbl_sid_to_secattr(u32 sid,
109 struct netlbl_lsm_secattr *secattr);
110#else
111static inline int security_netlbl_secattr_to_sid(
112 struct netlbl_lsm_secattr *secattr,
113 u32 base_sid,
114 u32 *sid)
115{
116 return -EIDRM;
117}
118
119static inline int security_netlbl_sid_to_secattr(u32 sid,
120 struct netlbl_lsm_secattr *secattr)
121{
122 return -ENOENT;
123}
124#endif /* CONFIG_NETLABEL */
125
126const char *security_get_initial_sid_context(u32 sid);
127
105#endif /* _SELINUX_SECURITY_H_ */ 128#endif /* _SELINUX_SECURITY_H_ */
106 129
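With the CONFIG_NETLABEL=n stubs above returning -EIDRM and -ENOENT (the
same codes the real conversion routines below use on their failure paths),
callers can stay config-agnostic. A minimal caller sketch, assuming a
secattr obtained elsewhere:

	u32 sid;
	int rc;

	rc = security_netlbl_secattr_to_sid(&secattr, SECINITSID_UNLABELED, &sid);
	if (rc != 0)
		sid = SECSID_NULL;	/* -EIDRM when NetLabel is compiled out */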
diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c
new file mode 100644
index 000000000000..bf8750791dd1
--- /dev/null
+++ b/security/selinux/netlabel.c
@@ -0,0 +1,363 @@
1/*
2 * SELinux NetLabel Support
3 *
4 * This file provides the necessary glue to tie NetLabel into the SELinux
5 * subsystem.
6 *
7 * Author: Paul Moore <paul.moore@hp.com>
8 *
9 */
10
11/*
12 * (c) Copyright Hewlett-Packard Development Company, L.P., 2007
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
22 * the GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 *
28 */
29
30#include <linux/spinlock.h>
31#include <linux/rcupdate.h>
32#include <net/sock.h>
33#include <net/netlabel.h>
34
35#include "objsec.h"
36#include "security.h"
37
38/**
39 * selinux_netlbl_socket_setsid - Label a socket using the NetLabel mechanism
40 * @sock: the socket to label
41 * @sid: the SID to use
42 *
43 * Description:
44 * Attempt to label a socket using the NetLabel mechanism using the given
 45 * SID. Returns zero on success, negative values on failure. The caller
 46 * is responsible for calling rcu_read_lock() before calling this function
 47 * and rcu_read_unlock() after this function returns.
48 *
49 */
50static int selinux_netlbl_socket_setsid(struct socket *sock, u32 sid)
51{
52 int rc;
53 struct sk_security_struct *sksec = sock->sk->sk_security;
54 struct netlbl_lsm_secattr secattr;
55
56 rc = security_netlbl_sid_to_secattr(sid, &secattr);
57 if (rc != 0)
58 return rc;
59
60 rc = netlbl_socket_setattr(sock, &secattr);
61 if (rc == 0) {
62 spin_lock_bh(&sksec->nlbl_lock);
63 sksec->nlbl_state = NLBL_LABELED;
64 spin_unlock_bh(&sksec->nlbl_lock);
65 }
66
67 return rc;
68}
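A minimal caller sketch for the RCU contract described above; the concrete
callers later in this file, selinux_netlbl_socket_post_create() and
selinux_netlbl_inode_permission(), follow this same pattern:

	rcu_read_lock();
	if (sksec->nlbl_state == NLBL_REQUIRE)
		rc = selinux_netlbl_socket_setsid(sock, sksec->sid);
	rcu_read_unlock();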
69
70/**
71 * selinux_netlbl_cache_invalidate - Invalidate the NetLabel cache
72 *
73 * Description:
74 * Invalidate the NetLabel security attribute mapping cache.
75 *
76 */
77void selinux_netlbl_cache_invalidate(void)
78{
79 netlbl_cache_invalidate();
80}
81
82/**
83 * selinux_netlbl_sk_security_reset - Reset the NetLabel fields
84 * @ssec: the sk_security_struct
85 * @family: the socket family
86 *
87 * Description:
88 * Called when the NetLabel state of a sk_security_struct needs to be reset.
 89 * The caller is responsible for all the NetLabel sk_security_struct locking.
90 *
91 */
92void selinux_netlbl_sk_security_reset(struct sk_security_struct *ssec,
93 int family)
94{
95 if (family == PF_INET)
96 ssec->nlbl_state = NLBL_REQUIRE;
97 else
98 ssec->nlbl_state = NLBL_UNSET;
99}
100
101/**
102 * selinux_netlbl_sk_security_init - Setup the NetLabel fields
103 * @ssec: the sk_security_struct
104 * @family: the socket family
105 *
106 * Description:
107 * Called when a new sk_security_struct is allocated to initialize the NetLabel
108 * fields.
109 *
110 */
111void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec,
112 int family)
113{
114 /* No locking needed, we are the only one who has access to ssec */
115 selinux_netlbl_sk_security_reset(ssec, family);
116 spin_lock_init(&ssec->nlbl_lock);
117}
118
119/**
120 * selinux_netlbl_sk_security_clone - Copy the NetLabel fields
121 * @ssec: the original sk_security_struct
122 * @newssec: the cloned sk_security_struct
123 *
124 * Description:
125 * Clone the NetLabel specific sk_security_struct fields from @ssec to
126 * @newssec.
127 *
128 */
129void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec,
130 struct sk_security_struct *newssec)
131{
132 /* We don't need to take newssec->nlbl_lock because we are the only
133 * thread with access to newssec, but we do need to take the RCU read
134 * lock as other threads could have access to ssec */
135 rcu_read_lock();
136 selinux_netlbl_sk_security_reset(newssec, ssec->sk->sk_family);
137 newssec->sclass = ssec->sclass;
138 rcu_read_unlock();
139}
140
141/**
142 * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel
143 * @skb: the packet
144 * @base_sid: the SELinux SID to use as a context for MLS only attributes
145 * @sid: the SID
146 *
147 * Description:
148 * Call the NetLabel mechanism to get the security attributes of the given
149 * packet and use those attributes to determine the correct context/SID to
150 * assign to the packet. Returns zero on success, negative values on failure.
151 *
152 */
153int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid)
154{
155 int rc;
156 struct netlbl_lsm_secattr secattr;
157
158 netlbl_secattr_init(&secattr);
159 rc = netlbl_skbuff_getattr(skb, &secattr);
160 if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
161 rc = security_netlbl_secattr_to_sid(&secattr,
162 base_sid,
163 sid);
164 else
165 *sid = SECSID_NULL;
166 netlbl_secattr_destroy(&secattr);
167
168 return rc;
169}
170
171/**
172 * selinux_netlbl_sock_graft - Netlabel the new socket
173 * @sk: the new connection
174 * @sock: the new socket
175 *
176 * Description:
177 * The connection represented by @sk is being grafted onto @sock so set the
178 * socket's NetLabel to match the SID of @sk.
179 *
180 */
181void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock)
182{
183 struct inode_security_struct *isec = SOCK_INODE(sock)->i_security;
184 struct sk_security_struct *sksec = sk->sk_security;
185 struct netlbl_lsm_secattr secattr;
186 u32 nlbl_peer_sid;
187
188 sksec->sclass = isec->sclass;
189
190 rcu_read_lock();
191
192 if (sksec->nlbl_state != NLBL_REQUIRE) {
193 rcu_read_unlock();
194 return;
195 }
196
197 netlbl_secattr_init(&secattr);
198 if (netlbl_sock_getattr(sk, &secattr) == 0 &&
199 secattr.flags != NETLBL_SECATTR_NONE &&
200 security_netlbl_secattr_to_sid(&secattr,
201 SECINITSID_UNLABELED,
202 &nlbl_peer_sid) == 0)
203 sksec->peer_sid = nlbl_peer_sid;
204 netlbl_secattr_destroy(&secattr);
205
206 /* Try to set the NetLabel on the socket to save time later, if we fail
207 * here we will pick up the pieces in later calls to
208 * selinux_netlbl_inode_permission(). */
209 selinux_netlbl_socket_setsid(sock, sksec->sid);
210
211 rcu_read_unlock();
212}
213
214/**
215 * selinux_netlbl_socket_post_create - Label a socket using NetLabel
216 * @sock: the socket to label
217 *
218 * Description:
 219 * Attempt to label a socket with its SID using the NetLabel mechanism.
 220 * Returns zero on success, negative values on failure.
221 *
222 */
223int selinux_netlbl_socket_post_create(struct socket *sock)
224{
225 int rc = 0;
226 struct inode_security_struct *isec = SOCK_INODE(sock)->i_security;
227 struct sk_security_struct *sksec = sock->sk->sk_security;
228
229 sksec->sclass = isec->sclass;
230
231 rcu_read_lock();
232 if (sksec->nlbl_state == NLBL_REQUIRE)
233 rc = selinux_netlbl_socket_setsid(sock, sksec->sid);
234 rcu_read_unlock();
235
236 return rc;
237}
238
239/**
240 * selinux_netlbl_inode_permission - Verify the socket is NetLabel labeled
241 * @inode: the file descriptor's inode
242 * @mask: the permission mask
243 *
244 * Description:
245 * Looks at a file's inode and if it is marked as a socket protected by
246 * NetLabel then verify that the socket has been labeled, if not try to label
247 * the socket now with the inode's SID. Returns zero on success, negative
248 * values on failure.
249 *
250 */
251int selinux_netlbl_inode_permission(struct inode *inode, int mask)
252{
253 int rc;
254 struct sk_security_struct *sksec;
255 struct socket *sock;
256
257 if (!S_ISSOCK(inode->i_mode) ||
258 ((mask & (MAY_WRITE | MAY_APPEND)) == 0))
259 return 0;
260 sock = SOCKET_I(inode);
261 sksec = sock->sk->sk_security;
262
263 rcu_read_lock();
264 if (sksec->nlbl_state != NLBL_REQUIRE) {
265 rcu_read_unlock();
266 return 0;
267 }
268 local_bh_disable();
269 bh_lock_sock_nested(sock->sk);
270 rc = selinux_netlbl_socket_setsid(sock, sksec->sid);
271 bh_unlock_sock(sock->sk);
272 local_bh_enable();
273 rcu_read_unlock();
274
275 return rc;
276}
277
278/**
279 * selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel
280 * @sksec: the sock's sk_security_struct
281 * @skb: the packet
282 * @ad: the audit data
283 *
284 * Description:
285 * Fetch the NetLabel security attributes from @skb and perform an access check
286 * against the receiving socket. Returns zero on success, negative values on
287 * error.
288 *
289 */
290int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
291 struct sk_buff *skb,
292 struct avc_audit_data *ad)
293{
294 int rc;
295 u32 netlbl_sid;
296 u32 recv_perm;
297
298 rc = selinux_netlbl_skbuff_getsid(skb,
299 SECINITSID_UNLABELED,
300 &netlbl_sid);
301 if (rc != 0)
302 return rc;
303
304 if (netlbl_sid == SECSID_NULL)
305 return 0;
306
307 switch (sksec->sclass) {
308 case SECCLASS_UDP_SOCKET:
309 recv_perm = UDP_SOCKET__RECVFROM;
310 break;
311 case SECCLASS_TCP_SOCKET:
312 recv_perm = TCP_SOCKET__RECVFROM;
313 break;
314 default:
315 recv_perm = RAWIP_SOCKET__RECVFROM;
316 }
317
318 rc = avc_has_perm(sksec->sid,
319 netlbl_sid,
320 sksec->sclass,
321 recv_perm,
322 ad);
323 if (rc == 0)
324 return 0;
325
326 netlbl_skbuff_err(skb, rc);
327 return rc;
328}
329
330/**
331 * selinux_netlbl_socket_setsockopt - Do not allow users to remove a NetLabel
332 * @sock: the socket
333 * @level: the socket level or protocol
334 * @optname: the socket option name
335 *
336 * Description:
 337 * Check the setsockopt() call; if the user is trying to replace the IP
 338 * options on a socket while a NetLabel is in place for the socket, deny
 339 * the access, otherwise allow it. Returns zero when the access is
340 * allowed, -EACCES when denied, and other negative values on error.
341 *
342 */
343int selinux_netlbl_socket_setsockopt(struct socket *sock,
344 int level,
345 int optname)
346{
347 int rc = 0;
348 struct sk_security_struct *sksec = sock->sk->sk_security;
349 struct netlbl_lsm_secattr secattr;
350
351 rcu_read_lock();
352 if (level == IPPROTO_IP && optname == IP_OPTIONS &&
353 sksec->nlbl_state == NLBL_LABELED) {
354 netlbl_secattr_init(&secattr);
355 rc = netlbl_socket_getattr(sock, &secattr);
356 if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
357 rc = -EACCES;
358 netlbl_secattr_destroy(&secattr);
359 }
360 rcu_read_unlock();
361
362 return rc;
363}
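These helpers are glue for the LSM hooks in hooks.c (also touched by this
patch); a hedged sketch of the expected wiring, with the hook signature
assumed from the 2.6.20-era LSM API rather than taken from this diff:

	static int selinux_socket_post_create(struct socket *sock, int family,
					      int type, int protocol, int kern)
	{
		/* ... regular SELinux inode/sock labeling ... */
		return selinux_netlbl_socket_post_create(sock);
	}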
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 93b3177c7585..aca099aa2ed3 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -96,12 +96,18 @@ enum sel_inos {
96 SEL_COMMIT_BOOLS, /* commit new boolean values */ 96 SEL_COMMIT_BOOLS, /* commit new boolean values */
97 SEL_MLS, /* return if MLS policy is enabled */ 97 SEL_MLS, /* return if MLS policy is enabled */
98 SEL_DISABLE, /* disable SELinux until next reboot */ 98 SEL_DISABLE, /* disable SELinux until next reboot */
99 SEL_AVC, /* AVC management directory */
100 SEL_MEMBER, /* compute polyinstantiation membership decision */ 99 SEL_MEMBER, /* compute polyinstantiation membership decision */
101 SEL_CHECKREQPROT, /* check requested protection, not kernel-applied one */ 100 SEL_CHECKREQPROT, /* check requested protection, not kernel-applied one */
102 SEL_COMPAT_NET, /* whether to use old compat network packet controls */ 101 SEL_COMPAT_NET, /* whether to use old compat network packet controls */
102 SEL_INO_NEXT, /* The next inode number to use */
103}; 103};
104 104
105static unsigned long sel_last_ino = SEL_INO_NEXT - 1;
106
107#define SEL_INITCON_INO_OFFSET 0x01000000
108#define SEL_BOOL_INO_OFFSET 0x02000000
109#define SEL_INO_MASK 0x00ffffff
110
105#define TMPBUFLEN 12 111#define TMPBUFLEN 12
106static ssize_t sel_read_enforce(struct file *filp, char __user *buf, 112static ssize_t sel_read_enforce(struct file *filp, char __user *buf,
107 size_t count, loff_t *ppos) 113 size_t count, loff_t *ppos)
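The new numbering packs a namespace tag into the top byte of the inode
number and an index into the low 24 bits, replacing the old flat
BOOL_INO_OFFSET scheme. A worked example for boolean number 5:

	unsigned long ino = 5 | SEL_BOOL_INO_OFFSET;	/* 0x02000005 */
	unsigned long idx = ino & SEL_INO_MASK;		/* decodes back to 5 */
	/* initial-SID files use SEL_INITCON_INO_OFFSET (0x01000000) the same
	 * way, and dynamically created entries simply take ++sel_last_ino. */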
@@ -777,8 +783,6 @@ static struct inode *sel_make_inode(struct super_block *sb, int mode)
777 return ret; 783 return ret;
778} 784}
779 785
780#define BOOL_INO_OFFSET 30
781
782static ssize_t sel_read_bool(struct file *filep, char __user *buf, 786static ssize_t sel_read_bool(struct file *filep, char __user *buf,
783 size_t count, loff_t *ppos) 787 size_t count, loff_t *ppos)
784{ 788{
@@ -806,14 +810,14 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf,
806 } 810 }
807 811
808 inode = filep->f_path.dentry->d_inode; 812 inode = filep->f_path.dentry->d_inode;
 809 cur_enforcing = security_get_bool_value(inode->i_ino - BOOL_INO_OFFSET); 813 cur_enforcing = security_get_bool_value(inode->i_ino & SEL_INO_MASK);
810 if (cur_enforcing < 0) { 814 if (cur_enforcing < 0) {
811 ret = cur_enforcing; 815 ret = cur_enforcing;
812 goto out; 816 goto out;
813 } 817 }
814 818
815 length = scnprintf(page, PAGE_SIZE, "%d %d", cur_enforcing, 819 length = scnprintf(page, PAGE_SIZE, "%d %d", cur_enforcing,
 816 bool_pending_values[inode->i_ino - BOOL_INO_OFFSET]); 820 bool_pending_values[inode->i_ino & SEL_INO_MASK]);
817 ret = simple_read_from_buffer(buf, count, ppos, page, length); 821 ret = simple_read_from_buffer(buf, count, ppos, page, length);
818out: 822out:
819 mutex_unlock(&sel_mutex); 823 mutex_unlock(&sel_mutex);
@@ -865,7 +869,7 @@ static ssize_t sel_write_bool(struct file *filep, const char __user *buf,
865 new_value = 1; 869 new_value = 1;
866 870
867 inode = filep->f_path.dentry->d_inode; 871 inode = filep->f_path.dentry->d_inode;
 868 bool_pending_values[inode->i_ino - BOOL_INO_OFFSET] = new_value; 872 bool_pending_values[inode->i_ino & SEL_INO_MASK] = new_value;
869 length = count; 873 length = count;
870 874
871out: 875out:
@@ -1029,7 +1033,7 @@ static int sel_make_bools(void)
1029 isec->sid = sid; 1033 isec->sid = sid;
1030 isec->initialized = 1; 1034 isec->initialized = 1;
1031 inode->i_fop = &sel_bool_ops; 1035 inode->i_fop = &sel_bool_ops;
1032 inode->i_ino = i + BOOL_INO_OFFSET; 1036 inode->i_ino = i | SEL_BOOL_INO_OFFSET;
1033 d_add(dentry, inode); 1037 d_add(dentry, inode);
1034 } 1038 }
1035 bool_num = num; 1039 bool_num = num;
@@ -1234,6 +1238,56 @@ static int sel_make_avc_files(struct dentry *dir)
1234 goto out; 1238 goto out;
1235 } 1239 }
1236 inode->i_fop = files[i].ops; 1240 inode->i_fop = files[i].ops;
1241 inode->i_ino = ++sel_last_ino;
1242 d_add(dentry, inode);
1243 }
1244out:
1245 return ret;
1246}
1247
1248static ssize_t sel_read_initcon(struct file *file, char __user *buf,
1249 size_t count, loff_t *ppos)
1250{
1251 struct inode *inode;
1252 char *con;
1253 u32 sid, len;
1254 ssize_t ret;
1255
1256 inode = file->f_path.dentry->d_inode;
1257 sid = inode->i_ino & SEL_INO_MASK;
1258 ret = security_sid_to_context(sid, &con, &len);
1259 if (ret < 0)
1260 return ret;
1261
1262 ret = simple_read_from_buffer(buf, count, ppos, con, len);
1263 kfree(con);
1264 return ret;
1265}
1266
1267static const struct file_operations sel_initcon_ops = {
1268 .read = sel_read_initcon,
1269};
1270
1271static int sel_make_initcon_files(struct dentry *dir)
1272{
1273 int i, ret = 0;
1274
1275 for (i = 1; i <= SECINITSID_NUM; i++) {
1276 struct inode *inode;
1277 struct dentry *dentry;
1278 dentry = d_alloc_name(dir, security_get_initial_sid_context(i));
1279 if (!dentry) {
1280 ret = -ENOMEM;
1281 goto out;
1282 }
1283
1284 inode = sel_make_inode(dir->d_sb, S_IFREG|S_IRUGO);
1285 if (!inode) {
1286 ret = -ENOMEM;
1287 goto out;
1288 }
1289 inode->i_fop = &sel_initcon_ops;
1290 inode->i_ino = i | SEL_INITCON_INO_OFFSET;
1237 d_add(dentry, inode); 1291 d_add(dentry, inode);
1238 } 1292 }
1239out: 1293out:
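The files created by sel_make_initcon_files() expose one read-only node per
initial SID under the new "initial_contexts" directory. A minimal userspace
reader, assuming selinuxfs is mounted at /selinux ("kernel" being one of the
initial SID names from initial_sid_to_string):

	#include <stdio.h>

	int main(void)
	{
		char con[256];
		FILE *f = fopen("/selinux/initial_contexts/kernel", "r");

		if (!f)
			return 1;
		if (fgets(con, sizeof(con), f))
			printf("kernel context: %s\n", con);
		fclose(f);
		return 0;
	}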
@@ -1252,6 +1306,7 @@ static int sel_make_dir(struct inode *dir, struct dentry *dentry)
1252 } 1306 }
1253 inode->i_op = &simple_dir_inode_operations; 1307 inode->i_op = &simple_dir_inode_operations;
1254 inode->i_fop = &simple_dir_operations; 1308 inode->i_fop = &simple_dir_operations;
1309 inode->i_ino = ++sel_last_ino;
1255 /* directory inodes start off with i_nlink == 2 (for "." entry) */ 1310 /* directory inodes start off with i_nlink == 2 (for "." entry) */
1256 inc_nlink(inode); 1311 inc_nlink(inode);
1257 d_add(dentry, inode); 1312 d_add(dentry, inode);
@@ -1314,6 +1369,7 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent)
1314 ret = -ENOMEM; 1369 ret = -ENOMEM;
1315 goto err; 1370 goto err;
1316 } 1371 }
1372 inode->i_ino = ++sel_last_ino;
1317 isec = (struct inode_security_struct*)inode->i_security; 1373 isec = (struct inode_security_struct*)inode->i_security;
1318 isec->sid = SECINITSID_DEVNULL; 1374 isec->sid = SECINITSID_DEVNULL;
1319 isec->sclass = SECCLASS_CHR_FILE; 1375 isec->sclass = SECCLASS_CHR_FILE;
@@ -1336,6 +1392,21 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent)
1336 ret = sel_make_avc_files(dentry); 1392 ret = sel_make_avc_files(dentry);
1337 if (ret) 1393 if (ret)
1338 goto err; 1394 goto err;
1395
1396 dentry = d_alloc_name(sb->s_root, "initial_contexts");
1397 if (!dentry) {
1398 ret = -ENOMEM;
1399 goto err;
1400 }
1401
1402 ret = sel_make_dir(root_inode, dentry);
1403 if (ret)
1404 goto err;
1405
1406 ret = sel_make_initcon_files(dentry);
1407 if (ret)
1408 goto err;
1409
1339out: 1410out:
1340 return ret; 1411 return ret;
1341err: 1412err:
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 1e52356664d6..40660ffd49b6 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -39,7 +39,6 @@
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/audit.h> 40#include <linux/audit.h>
41#include <linux/mutex.h> 41#include <linux/mutex.h>
42#include <net/sock.h>
43#include <net/netlabel.h> 42#include <net/netlabel.h>
44 43
45#include "flask.h" 44#include "flask.h"
@@ -53,7 +52,7 @@
53#include "conditional.h" 52#include "conditional.h"
54#include "mls.h" 53#include "mls.h"
55#include "objsec.h" 54#include "objsec.h"
56#include "selinux_netlabel.h" 55#include "netlabel.h"
57#include "xfrm.h" 56#include "xfrm.h"
58#include "ebitmap.h" 57#include "ebitmap.h"
59 58
@@ -594,6 +593,13 @@ static int context_struct_to_string(struct context *context, char **scontext, u32 *scontext_len)
594 593
595#include "initial_sid_to_string.h" 594#include "initial_sid_to_string.h"
596 595
596const char *security_get_initial_sid_context(u32 sid)
597{
598 if (unlikely(sid > SECINITSID_NUM))
599 return NULL;
600 return initial_sid_to_string[sid];
601}
602
597/** 603/**
598 * security_sid_to_context - Obtain a context for a given SID. 604 * security_sid_to_context - Obtain a context for a given SID.
599 * @sid: security identifier, SID 605 * @sid: security identifier, SID
@@ -1050,6 +1056,8 @@ static int validate_classes(struct policydb *p)
1050 1056
1051 for (i = 1; i < kdefs->cts_len; i++) { 1057 for (i = 1; i < kdefs->cts_len; i++) {
1052 def_class = kdefs->class_to_string[i]; 1058 def_class = kdefs->class_to_string[i];
1059 if (!def_class)
1060 continue;
1053 if (i > p->p_classes.nprim) { 1061 if (i > p->p_classes.nprim) {
1054 printk(KERN_INFO 1062 printk(KERN_INFO
1055 "security: class %s not defined in policy\n", 1063 "security: class %s not defined in policy\n",
@@ -1249,6 +1257,7 @@ bad:
1249} 1257}
1250 1258
1251extern void selinux_complete_init(void); 1259extern void selinux_complete_init(void);
1260static int security_preserve_bools(struct policydb *p);
1252 1261
1253/** 1262/**
1254 * security_load_policy - Load a security policy configuration. 1263 * security_load_policy - Load a security policy configuration.
@@ -1325,6 +1334,12 @@ int security_load_policy(void *data, size_t len)
1325 goto err; 1334 goto err;
1326 } 1335 }
1327 1336
1337 rc = security_preserve_bools(&newpolicydb);
1338 if (rc) {
1339 printk(KERN_ERR "security: unable to preserve booleans\n");
1340 goto err;
1341 }
1342
1328 /* Clone the SID table. */ 1343 /* Clone the SID table. */
1329 sidtab_shutdown(&sidtab); 1344 sidtab_shutdown(&sidtab);
1330 if (sidtab_map(&sidtab, clone_sid, &newsidtab)) { 1345 if (sidtab_map(&sidtab, clone_sid, &newsidtab)) {
@@ -1882,6 +1897,37 @@ out:
1882 return rc; 1897 return rc;
1883} 1898}
1884 1899
1900static int security_preserve_bools(struct policydb *p)
1901{
1902 int rc, nbools = 0, *bvalues = NULL, i;
1903 char **bnames = NULL;
1904 struct cond_bool_datum *booldatum;
1905 struct cond_node *cur;
1906
1907 rc = security_get_bools(&nbools, &bnames, &bvalues);
1908 if (rc)
1909 goto out;
1910 for (i = 0; i < nbools; i++) {
1911 booldatum = hashtab_search(p->p_bools.table, bnames[i]);
1912 if (booldatum)
1913 booldatum->state = bvalues[i];
1914 }
1915 for (cur = p->cond_list; cur != NULL; cur = cur->next) {
1916 rc = evaluate_cond_node(p, cur);
1917 if (rc)
1918 goto out;
1919 }
1920
1921out:
1922 if (bnames) {
1923 for (i = 0; i < nbools; i++)
1924 kfree(bnames[i]);
1925 }
1926 kfree(bnames);
1927 kfree(bvalues);
1928 return rc;
1929}
1930
1885/* 1931/*
1886 * security_sid_mls_copy() - computes a new sid based on the given 1932 * security_sid_mls_copy() - computes a new sid based on the given
1887 * sid and the mls portion of mls_sid. 1933 * sid and the mls portion of mls_sid.
@@ -2198,41 +2244,15 @@ void selinux_audit_set_callback(int (*callback)(void))
2198 aurule_callback = callback; 2244 aurule_callback = callback;
2199} 2245}
2200 2246
2201/**
2202 * security_skb_extlbl_sid - Determine the external label of a packet
2203 * @skb: the packet
2204 * @base_sid: the SELinux SID to use as a context for MLS only external labels
2205 * @sid: the packet's SID
2206 *
2207 * Description:
2208 * Check the various different forms of external packet labeling and determine
2209 * the external SID for the packet.
2210 *
2211 */
2212void security_skb_extlbl_sid(struct sk_buff *skb, u32 base_sid, u32 *sid)
2213{
2214 u32 xfrm_sid;
2215 u32 nlbl_sid;
2216
2217 selinux_skb_xfrm_sid(skb, &xfrm_sid);
2218 if (selinux_netlbl_skbuff_getsid(skb,
2219 (xfrm_sid == SECSID_NULL ?
2220 base_sid : xfrm_sid),
2221 &nlbl_sid) != 0)
2222 nlbl_sid = SECSID_NULL;
2223
2224 *sid = (nlbl_sid == SECSID_NULL ? xfrm_sid : nlbl_sid);
2225}
2226
2227#ifdef CONFIG_NETLABEL 2247#ifdef CONFIG_NETLABEL
2228/* 2248/*
2229 * This is the structure we store inside the NetLabel cache block. 2249 * NetLabel cache structure
2230 */ 2250 */
2231#define NETLBL_CACHE(x) ((struct netlbl_cache *)(x)) 2251#define NETLBL_CACHE(x) ((struct selinux_netlbl_cache *)(x))
2232#define NETLBL_CACHE_T_NONE 0 2252#define NETLBL_CACHE_T_NONE 0
2233#define NETLBL_CACHE_T_SID 1 2253#define NETLBL_CACHE_T_SID 1
2234#define NETLBL_CACHE_T_MLS 2 2254#define NETLBL_CACHE_T_MLS 2
2235struct netlbl_cache { 2255struct selinux_netlbl_cache {
2236 u32 type; 2256 u32 type;
2237 union { 2257 union {
2238 u32 sid; 2258 u32 sid;
@@ -2241,7 +2261,7 @@ struct netlbl_cache {
2241}; 2261};
2242 2262
2243/** 2263/**
2244 * selinux_netlbl_cache_free - Free the NetLabel cached data 2264 * security_netlbl_cache_free - Free the NetLabel cached data
2245 * @data: the data to free 2265 * @data: the data to free
2246 * 2266 *
2247 * Description: 2267 * Description:
@@ -2249,9 +2269,9 @@ struct netlbl_cache {
2249 * netlbl_lsm_cache structure. 2269 * netlbl_lsm_cache structure.
2250 * 2270 *
2251 */ 2271 */
2252static void selinux_netlbl_cache_free(const void *data) 2272static void security_netlbl_cache_free(const void *data)
2253{ 2273{
2254 struct netlbl_cache *cache; 2274 struct selinux_netlbl_cache *cache;
2255 2275
2256 if (data == NULL) 2276 if (data == NULL)
2257 return; 2277 return;
@@ -2266,33 +2286,33 @@ static void selinux_netlbl_cache_free(const void *data)
2266} 2286}
2267 2287
2268/** 2288/**
2269 * selinux_netlbl_cache_add - Add an entry to the NetLabel cache 2289 * security_netlbl_cache_add - Add an entry to the NetLabel cache
2270 * @skb: the packet 2290 * @secattr: the NetLabel packet security attributes
2271 * @ctx: the SELinux context 2291 * @ctx: the SELinux context
2272 * 2292 *
2273 * Description: 2293 * Description:
2274 * Attempt to cache the context in @ctx, which was derived from the packet in 2294 * Attempt to cache the context in @ctx, which was derived from the packet in
2275 * @skb, in the NetLabel subsystem cache. 2295 * @skb, in the NetLabel subsystem cache. This function assumes @secattr has
2296 * already been initialized.
2276 * 2297 *
2277 */ 2298 */
2278static void selinux_netlbl_cache_add(struct sk_buff *skb, struct context *ctx) 2299static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr,
2300 struct context *ctx)
2279{ 2301{
2280 struct netlbl_cache *cache = NULL; 2302 struct selinux_netlbl_cache *cache = NULL;
2281 struct netlbl_lsm_secattr secattr;
2282 2303
2283 netlbl_secattr_init(&secattr); 2304 secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC);
2284 secattr.cache = netlbl_secattr_cache_alloc(GFP_ATOMIC); 2305 if (secattr->cache == NULL)
2285 if (secattr.cache == NULL) 2306 return;
2286 goto netlbl_cache_add_return;
2287 2307
2288 cache = kzalloc(sizeof(*cache), GFP_ATOMIC); 2308 cache = kzalloc(sizeof(*cache), GFP_ATOMIC);
2289 if (cache == NULL) 2309 if (cache == NULL)
2290 goto netlbl_cache_add_return; 2310 return;
2291 2311
2292 cache->type = NETLBL_CACHE_T_MLS; 2312 cache->type = NETLBL_CACHE_T_MLS;
2293 if (ebitmap_cpy(&cache->data.mls_label.level[0].cat, 2313 if (ebitmap_cpy(&cache->data.mls_label.level[0].cat,
2294 &ctx->range.level[0].cat) != 0) 2314 &ctx->range.level[0].cat) != 0)
2295 goto netlbl_cache_add_return; 2315 return;
2296 cache->data.mls_label.level[1].cat.highbit = 2316 cache->data.mls_label.level[1].cat.highbit =
2297 cache->data.mls_label.level[0].cat.highbit; 2317 cache->data.mls_label.level[0].cat.highbit;
2298 cache->data.mls_label.level[1].cat.node = 2318 cache->data.mls_label.level[1].cat.node =
@@ -2300,52 +2320,40 @@ static void selinux_netlbl_cache_add(struct sk_buff *skb, struct context *ctx)
2300 cache->data.mls_label.level[0].sens = ctx->range.level[0].sens; 2320 cache->data.mls_label.level[0].sens = ctx->range.level[0].sens;
2301 cache->data.mls_label.level[1].sens = ctx->range.level[0].sens; 2321 cache->data.mls_label.level[1].sens = ctx->range.level[0].sens;
2302 2322
2303 secattr.cache->free = selinux_netlbl_cache_free; 2323 secattr->cache->free = security_netlbl_cache_free;
2304 secattr.cache->data = (void *)cache; 2324 secattr->cache->data = (void *)cache;
2305 secattr.flags = NETLBL_SECATTR_CACHE; 2325 secattr->flags |= NETLBL_SECATTR_CACHE;
2306
2307 netlbl_cache_add(skb, &secattr);
2308
2309netlbl_cache_add_return:
2310 netlbl_secattr_destroy(&secattr);
2311} 2326}
2312 2327
2313/** 2328/**
2314 * selinux_netlbl_cache_invalidate - Invalidate the NetLabel cache 2329 * security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID
2315 *
2316 * Description:
2317 * Invalidate the NetLabel security attribute mapping cache.
2318 *
2319 */
2320void selinux_netlbl_cache_invalidate(void)
2321{
2322 netlbl_cache_invalidate();
2323}
2324
2325/**
2326 * selinux_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID
2327 * @skb: the network packet
2328 * @secattr: the NetLabel packet security attributes 2330 * @secattr: the NetLabel packet security attributes
2329 * @base_sid: the SELinux SID to use as a context for MLS only attributes 2331 * @base_sid: the SELinux SID to use as a context for MLS only attributes
2330 * @sid: the SELinux SID 2332 * @sid: the SELinux SID
2331 * 2333 *
2332 * Description: 2334 * Description:
2333 * Convert the given NetLabel packet security attributes in @secattr into a 2335 * Convert the given NetLabel security attributes in @secattr into a
2334 * SELinux SID. If the @secattr field does not contain a full SELinux 2336 * SELinux SID. If the @secattr field does not contain a full SELinux
2335 * SID/context then use the context in @base_sid as the foundation. If @skb 2337 * SID/context then use the context in @base_sid as the foundation. If
2336 * is not NULL attempt to cache as much data as possibile. Returns zero on 2338 * possible the 'cache' field of @secattr is set and the CACHE flag is set;
2337 * success, negative values on failure. 2339 * this is to allow the @secattr to be used by NetLabel to cache the secattr to
2340 * SID conversion for future lookups. Returns zero on success, negative
2341 * values on failure.
2338 * 2342 *
2339 */ 2343 */
2340static int selinux_netlbl_secattr_to_sid(struct sk_buff *skb, 2344int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
2341 struct netlbl_lsm_secattr *secattr, 2345 u32 base_sid,
2342 u32 base_sid, 2346 u32 *sid)
2343 u32 *sid)
2344{ 2347{
2345 int rc = -EIDRM; 2348 int rc = -EIDRM;
2346 struct context *ctx; 2349 struct context *ctx;
2347 struct context ctx_new; 2350 struct context ctx_new;
2348 struct netlbl_cache *cache; 2351 struct selinux_netlbl_cache *cache;
2352
2353 if (!ss_initialized) {
2354 *sid = SECSID_NULL;
2355 return 0;
2356 }
2349 2357
2350 POLICY_RDLOCK; 2358 POLICY_RDLOCK;
2351 2359
@@ -2410,8 +2418,8 @@ static int selinux_netlbl_secattr_to_sid(struct sk_buff *skb,
2410 if (rc != 0) 2418 if (rc != 0)
2411 goto netlbl_secattr_to_sid_return_cleanup; 2419 goto netlbl_secattr_to_sid_return_cleanup;
2412 2420
2413 if (skb != NULL) 2421 security_netlbl_cache_add(secattr, &ctx_new);
2414 selinux_netlbl_cache_add(skb, &ctx_new); 2422
2415 ebitmap_destroy(&ctx_new.range.level[0].cat); 2423 ebitmap_destroy(&ctx_new.range.level[0].cat);
2416 } else { 2424 } else {
2417 *sid = SECSID_NULL; 2425 *sid = SECSID_NULL;
@@ -2427,338 +2435,43 @@ netlbl_secattr_to_sid_return_cleanup:
2427} 2435}
2428 2436
2429/** 2437/**
2430 * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel 2438 * security_netlbl_sid_to_secattr - Convert a SELinux SID to a NetLabel secattr
2431 * @skb: the packet 2439 * @sid: the SELinux SID
2432 * @base_sid: the SELinux SID to use as a context for MLS only attributes 2440 * @secattr: the NetLabel packet security attributes
2433 * @sid: the SID
2434 *
2435 * Description:
2436 * Call the NetLabel mechanism to get the security attributes of the given
2437 * packet and use those attributes to determine the correct context/SID to
2438 * assign to the packet. Returns zero on success, negative values on failure.
2439 *
2440 */
2441int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid)
2442{
2443 int rc;
2444 struct netlbl_lsm_secattr secattr;
2445
2446 netlbl_secattr_init(&secattr);
2447 rc = netlbl_skbuff_getattr(skb, &secattr);
2448 if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
2449 rc = selinux_netlbl_secattr_to_sid(skb,
2450 &secattr,
2451 base_sid,
2452 sid);
2453 else
2454 *sid = SECSID_NULL;
2455 netlbl_secattr_destroy(&secattr);
2456
2457 return rc;
2458}
2459
2460/**
2461 * selinux_netlbl_socket_setsid - Label a socket using the NetLabel mechanism
2462 * @sock: the socket to label
2463 * @sid: the SID to use
2464 * 2441 *
2465 * Description: 2442 * Description:
2466 * Attempt to label a socket using the NetLabel mechanism using the given 2443 * Convert the given SELinux SID in @sid into a NetLabel security attribute.
2467 * SID. Returns zero values on success, negative values on failure. The 2444 * Returns zero on success, negative values on failure.
2468 * caller is responsibile for calling rcu_read_lock() before calling this
2469 * this function and rcu_read_unlock() after this function returns.
2470 * 2445 *
2471 */ 2446 */
2472static int selinux_netlbl_socket_setsid(struct socket *sock, u32 sid) 2447int security_netlbl_sid_to_secattr(u32 sid, struct netlbl_lsm_secattr *secattr)
2473{ 2448{
2474 int rc = -ENOENT; 2449 int rc = -ENOENT;
2475 struct sk_security_struct *sksec = sock->sk->sk_security;
2476 struct netlbl_lsm_secattr secattr;
2477 struct context *ctx; 2450 struct context *ctx;
2478 2451
2452 netlbl_secattr_init(secattr);
2453
2479 if (!ss_initialized) 2454 if (!ss_initialized)
2480 return 0; 2455 return 0;
2481 2456
2482 netlbl_secattr_init(&secattr);
2483
2484 POLICY_RDLOCK; 2457 POLICY_RDLOCK;
2485
2486 ctx = sidtab_search(&sidtab, sid); 2458 ctx = sidtab_search(&sidtab, sid);
2487 if (ctx == NULL) 2459 if (ctx == NULL)
2488 goto netlbl_socket_setsid_return; 2460 goto netlbl_sid_to_secattr_failure;
2489 2461 secattr->domain = kstrdup(policydb.p_type_val_to_name[ctx->type - 1],
2490 secattr.domain = kstrdup(policydb.p_type_val_to_name[ctx->type - 1], 2462 GFP_ATOMIC);
2491 GFP_ATOMIC); 2463 secattr->flags |= NETLBL_SECATTR_DOMAIN;
2492 secattr.flags |= NETLBL_SECATTR_DOMAIN; 2464 mls_export_netlbl_lvl(ctx, secattr);
2493 mls_export_netlbl_lvl(ctx, &secattr); 2465 rc = mls_export_netlbl_cat(ctx, secattr);
2494 rc = mls_export_netlbl_cat(ctx, &secattr);
2495 if (rc != 0) 2466 if (rc != 0)
2496 goto netlbl_socket_setsid_return; 2467 goto netlbl_sid_to_secattr_failure;
2497
2498 rc = netlbl_socket_setattr(sock, &secattr);
2499 if (rc == 0) {
2500 spin_lock_bh(&sksec->nlbl_lock);
2501 sksec->nlbl_state = NLBL_LABELED;
2502 spin_unlock_bh(&sksec->nlbl_lock);
2503 }
2504
2505netlbl_socket_setsid_return:
2506 POLICY_RDUNLOCK; 2468 POLICY_RDUNLOCK;
2507 netlbl_secattr_destroy(&secattr);
2508 return rc;
2509}
2510
2511/**
2512 * selinux_netlbl_sk_security_reset - Reset the NetLabel fields
2513 * @ssec: the sk_security_struct
2514 * @family: the socket family
2515 *
2516 * Description:
2517 * Called when the NetLabel state of a sk_security_struct needs to be reset.
2518 * The caller is responsibile for all the NetLabel sk_security_struct locking.
2519 *
2520 */
2521void selinux_netlbl_sk_security_reset(struct sk_security_struct *ssec,
2522 int family)
2523{
2524 if (family == PF_INET)
2525 ssec->nlbl_state = NLBL_REQUIRE;
2526 else
2527 ssec->nlbl_state = NLBL_UNSET;
2528}
2529 2469
2530/** 2470 return 0;
2531 * selinux_netlbl_sk_security_init - Setup the NetLabel fields
2532 * @ssec: the sk_security_struct
2533 * @family: the socket family
2534 *
2535 * Description:
2536 * Called when a new sk_security_struct is allocated to initialize the NetLabel
2537 * fields.
2538 *
2539 */
2540void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec,
2541 int family)
2542{
2543 /* No locking needed, we are the only one who has access to ssec */
2544 selinux_netlbl_sk_security_reset(ssec, family);
2545 spin_lock_init(&ssec->nlbl_lock);
2546}
2547
2548/**
2549 * selinux_netlbl_sk_security_clone - Copy the NetLabel fields
2550 * @ssec: the original sk_security_struct
2551 * @newssec: the cloned sk_security_struct
2552 *
2553 * Description:
2554 * Clone the NetLabel specific sk_security_struct fields from @ssec to
2555 * @newssec.
2556 *
2557 */
2558void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec,
2559 struct sk_security_struct *newssec)
2560{
2561 /* We don't need to take newssec->nlbl_lock because we are the only
2562 * thread with access to newssec, but we do need to take the RCU read
2563 * lock as other threads could have access to ssec */
2564 rcu_read_lock();
2565 selinux_netlbl_sk_security_reset(newssec, ssec->sk->sk_family);
2566 newssec->sclass = ssec->sclass;
2567 rcu_read_unlock();
2568}
2569
2570/**
2571 * selinux_netlbl_socket_post_create - Label a socket using NetLabel
2572 * @sock: the socket to label
2573 *
2574 * Description:
2575 * Attempt to label a socket using the NetLabel mechanism using the given
2576 * SID. Returns zero values on success, negative values on failure.
2577 *
2578 */
2579int selinux_netlbl_socket_post_create(struct socket *sock)
2580{
2581 int rc = 0;
2582 struct inode_security_struct *isec = SOCK_INODE(sock)->i_security;
2583 struct sk_security_struct *sksec = sock->sk->sk_security;
2584
2585 sksec->sclass = isec->sclass;
2586
2587 rcu_read_lock();
2588 if (sksec->nlbl_state == NLBL_REQUIRE)
2589 rc = selinux_netlbl_socket_setsid(sock, sksec->sid);
2590 rcu_read_unlock();
2591
2592 return rc;
2593}
2594
2595/**
2596 * selinux_netlbl_sock_graft - Netlabel the new socket
2597 * @sk: the new connection
2598 * @sock: the new socket
2599 *
2600 * Description:
2601 * The connection represented by @sk is being grafted onto @sock so set the
2602 * socket's NetLabel to match the SID of @sk.
2603 *
2604 */
2605void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock)
2606{
2607 struct inode_security_struct *isec = SOCK_INODE(sock)->i_security;
2608 struct sk_security_struct *sksec = sk->sk_security;
2609 struct netlbl_lsm_secattr secattr;
2610 u32 nlbl_peer_sid;
2611
2612 sksec->sclass = isec->sclass;
2613
2614 rcu_read_lock();
2615
2616 if (sksec->nlbl_state != NLBL_REQUIRE) {
2617 rcu_read_unlock();
2618 return;
2619 }
2620
2621 netlbl_secattr_init(&secattr);
2622 if (netlbl_sock_getattr(sk, &secattr) == 0 &&
2623 secattr.flags != NETLBL_SECATTR_NONE &&
2624 selinux_netlbl_secattr_to_sid(NULL,
2625 &secattr,
2626 SECINITSID_UNLABELED,
2627 &nlbl_peer_sid) == 0)
2628 sksec->peer_sid = nlbl_peer_sid;
2629 netlbl_secattr_destroy(&secattr);
2630
2631 /* Try to set the NetLabel on the socket to save time later, if we fail
2632 * here we will pick up the pieces in later calls to
2633 * selinux_netlbl_inode_permission(). */
2634 selinux_netlbl_socket_setsid(sock, sksec->sid);
2635
2636 rcu_read_unlock();
2637}
2638
2639/**
2640 * selinux_netlbl_inode_permission - Verify the socket is NetLabel labeled
2641 * @inode: the file descriptor's inode
2642 * @mask: the permission mask
2643 *
2644 * Description:
2645 * Looks at a file's inode and if it is marked as a socket protected by
2646 * NetLabel then verify that the socket has been labeled, if not try to label
2647 * the socket now with the inode's SID. Returns zero on success, negative
2648 * values on failure.
2649 *
2650 */
2651int selinux_netlbl_inode_permission(struct inode *inode, int mask)
2652{
2653 int rc;
2654 struct sk_security_struct *sksec;
2655 struct socket *sock;
2656
2657 if (!S_ISSOCK(inode->i_mode) ||
2658 ((mask & (MAY_WRITE | MAY_APPEND)) == 0))
2659 return 0;
2660 sock = SOCKET_I(inode);
2661 sksec = sock->sk->sk_security;
2662
2663 rcu_read_lock();
2664 if (sksec->nlbl_state != NLBL_REQUIRE) {
2665 rcu_read_unlock();
2666 return 0;
2667 }
2668 local_bh_disable();
2669 bh_lock_sock_nested(sock->sk);
2670 rc = selinux_netlbl_socket_setsid(sock, sksec->sid);
2671 bh_unlock_sock(sock->sk);
2672 local_bh_enable();
2673 rcu_read_unlock();
2674
2675 return rc;
2676}
2677
2678/**
2679 * selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel
2680 * @sksec: the sock's sk_security_struct
2681 * @skb: the packet
2682 * @ad: the audit data
2683 *
2684 * Description:
2685 * Fetch the NetLabel security attributes from @skb and perform an access check
2686 * against the receiving socket. Returns zero on success, negative values on
2687 * error.
2688 *
2689 */
2690int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
2691 struct sk_buff *skb,
2692 struct avc_audit_data *ad)
2693{
2694 int rc;
2695 u32 netlbl_sid;
2696 u32 recv_perm;
2697
2698 rc = selinux_netlbl_skbuff_getsid(skb,
2699 SECINITSID_UNLABELED,
2700 &netlbl_sid);
2701 if (rc != 0)
2702 return rc;
2703
2704 if (netlbl_sid == SECSID_NULL)
2705 return 0;
2706
2707 switch (sksec->sclass) {
2708 case SECCLASS_UDP_SOCKET:
2709 recv_perm = UDP_SOCKET__RECVFROM;
2710 break;
2711 case SECCLASS_TCP_SOCKET:
2712 recv_perm = TCP_SOCKET__RECVFROM;
2713 break;
2714 default:
2715 recv_perm = RAWIP_SOCKET__RECVFROM;
2716 }
2717
2718 rc = avc_has_perm(sksec->sid,
2719 netlbl_sid,
2720 sksec->sclass,
2721 recv_perm,
2722 ad);
2723 if (rc == 0)
2724 return 0;
2725
2726 netlbl_skbuff_err(skb, rc);
2727 return rc;
2728}
2729
2730/**
2731 * selinux_netlbl_socket_setsockopt - Do not allow users to remove a NetLabel
2732 * @sock: the socket
2733 * @level: the socket level or protocol
2734 * @optname: the socket option name
2735 *
2736 * Description:
2737 * Check the setsockopt() call and if the user is trying to replace the IP
2738 * options on a socket and a NetLabel is in place for the socket deny the
2739 * access; otherwise allow the access. Returns zero when the access is
2740 * allowed, -EACCES when denied, and other negative values on error.
2741 *
2742 */
2743int selinux_netlbl_socket_setsockopt(struct socket *sock,
2744 int level,
2745 int optname)
2746{
2747 int rc = 0;
2748 struct sk_security_struct *sksec = sock->sk->sk_security;
2749 struct netlbl_lsm_secattr secattr;
2750
2751 rcu_read_lock();
2752 if (level == IPPROTO_IP && optname == IP_OPTIONS &&
2753 sksec->nlbl_state == NLBL_LABELED) {
2754 netlbl_secattr_init(&secattr);
2755 rc = netlbl_socket_getattr(sock, &secattr);
2756 if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
2757 rc = -EACCES;
2758 netlbl_secattr_destroy(&secattr);
2759 }
2760 rcu_read_unlock();
2761 2471
2472netlbl_sid_to_secattr_failure:
2473 POLICY_RDUNLOCK;
2474 netlbl_secattr_destroy(secattr);
2762 return rc; 2475 return rc;
2763} 2476}
2764#endif /* CONFIG_NETLABEL */ 2477#endif /* CONFIG_NETLABEL */