/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* alloc.c
*
* Extent allocs and frees
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
#include "suballoc.h"
#include "sysfile.h"
#include "file.h"
#include "super.h"
#include "uptodate.h"
#include "buffer_head_io.h"
/*
* ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
* the b-tree operations in ocfs2. Now all the b-tree operations are not
* limited to ocfs2_dinode only. Any data which need to allocate clusters
* to store can use b-tree. And it only needs to implement its ocfs2_extent_tree
* and operation.
*
* ocfs2_extent_tree contains info for the root of the b-tree, it must have a
* root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
* functions.
* ocfs2_extent_tree_operations abstract the normal operations we do for
* the root of extent b-tree.
*/
struct ocfs2_extent_tree;
struct ocfs2_extent_tree_operations {
void (*set_last_eb_blk) (struct ocfs2_extent_tree *et, u64 blkno);
u64 (*get_last_eb_blk) (struct ocfs2_extent_tree *et);
void (*update_clusters) (struct inode *inode,
struct ocfs2_extent_tree *et,
u32 new_clusters);
int (*sanity_check) (struct inode *inode, struct ocfs2_extent_tree *et);
};
struct ocfs2_extent_tree {
enum ocfs2_extent_tree_type type;
struct ocfs2_extent_tree_operations *eops;
struct buffer_head *root_bh;
struct ocfs2_extent_list *root_el;
};
static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
u64 blkno)
{
struct ocfs2_dinode *di = (struct ocfs2_dinode *)et->root_bh->b_data;
BUG_ON(et->type != OCFS2_DINODE_EXTENT);
di->i_last_eb_blk = cpu_to_le64(blkno);
}
static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
struct ocfs2_dinode *di = (struct ocfs2_dinode *)et->root_bh->b_data;
BUG_ON(et->type != OCFS2_DINODE_EXTENT);
return le64_to_cpu(di->i_last_eb_blk);
}
static void ocfs2_dinode_update_clusters(struct inode *inode,
struct ocfs2_extent_tree *et,
u32 clusters)
{
struct ocfs2_dinode *di =
(struct ocfs2_dinode *)et->root_bh->b_data;
le32_add_cpu(&di->i_clusters, clusters);
spin_lock(&OCFS2_I(inode)->ip_lock);
OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
spin_unlock(&OCFS2_I(inode)->ip_lock);
}
static int ocfs2_dinode_sanity_check(struct inode *inode,
struct ocfs2_extent_tree *et)
{
int ret = 0;
struct ocfs2_dinode *di;
BUG_ON(et->type != OCFS2_DINODE_EXTENT);
di = (struct ocfs2_dinode *)et->root_bh->b_data;
if (!OCFS2_IS_VALID_DINODE(di)) {
ret = -EIO;
ocfs2_error(inode->i_sb,
"Inode %llu has invalid path root",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
}
return ret;
}
static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
.set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
.get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
.update_clusters = ocfs2_dinode_update_clusters,
.sanity_check = ocfs2_dinode_sanity_check,
};
static struct ocfs2_extent_tree*
ocfs2_new_extent_tree(struct buffer_head *bh,
enum ocfs2_extent_tree_type et_type)
{
struct ocfs2_extent_tree *et;
et = kzalloc(sizeof(*et), GFP_NOFS);
if (!et)
return NULL;
et->type = et_type;
get_bh(bh);
et->root_bh = bh;
/* current we only support dinode extent. */
BUG_ON(et->type != OCFS2_DINODE_EXTENT);
if (et_type == OCFS2_DINODE_EXTENT) {
et->root_el = &((struct ocfs2_dinode *)bh->b_data)->id2.i_list;
et->eops = &ocfs2_dinode_et_ops;
}
return et;
}
static void ocfs2_free_extent_tree(struct ocfs2_extent_tree *et)
{
if (et) {
brelse(et->root_bh);
kfree(et);
}
}
static inline void ocfs2_set_last_eb_blk(struct ocfs2_extent_tree *et,
u64 new_last_eb_blk)
{
et->eops->set_last_eb_blk(et, new_last_eb_blk);
}
static inline u64 ocfs2_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
return et->eops->get_last_eb_blk(et);
}
static inline void ocfs2_update_clusters(struct inode *inode,
struct ocfs2_extent_tree *et,
u32 clusters)
{
et->eops->update_clusters(inode, et, clusters);
}
static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
struct ocfs2_extent_block *eb);
/*
* Structures which describe a path through a btree, and functions to
* manipulate them.
*
* The idea here is to be as generic as possible with the tree
* manipulation code.
*/
struct ocfs2_path_item {
struct buffer_head *bh;
struct ocfs2_extent_list *el;
};
#define OCFS2_MAX_PATH_DEPTH 5
struct ocfs2_path {
int p_tree_depth;
struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
};
#define path_root_bh(_path) ((_path)->p_node[0].bh)
#define path_root_el(_path) ((_path)->p_node[0].el)
#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
#define path_num_items(_path) ((_path)->p_tree_depth + 1)
/*
* Reset the actual path elements so that we can re-use the structure
* to build another path. Generally, this involves freeing the buffer
* heads.
*/
static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
{
int i, start = 0, depth = 0;
struct ocfs2_path_item *node;
if (keep_root)
start = 1;
for(i = start; i < path_num_items(path); i++) {
node = &path->p_node[i];
brelse(node->bh);
node->bh = NULL;
node->el = NULL;
}
/*
* Tree depth may change during truncate, or insert. If we're
* keeping the root extent list, then make sure that our path
* structure reflects the proper depth.
*/
if (keep_root)
depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
path->p_tree_depth = depth;
}
static void ocfs2_free_path(struct ocfs2_path *path)
{
if (path) {
ocfs2_reinit_path(path, 0);
kfree(path);
}
}
/*
* All the elements of src into dest. After this call, src could be freed
* without affecting dest.
*
* Both paths should have the same root. Any non-root elements of dest
* will be freed.
*/
static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
{
int i;
BUG_ON(path_root_bh(dest) != path_root_bh(src));
BUG_ON(path_root_el(dest) != path_root_el(src));
ocfs2_reinit_path(dest, 1);
for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
dest->p_node[i].bh = src->p_node[i].bh;
dest->p_node[i].el = src->p_node[i].el;
if (dest->p_node[i].bh)
get_bh(dest->p_node[i].bh);
}
}
/*
* Make the *dest path the same as src and re-initialize src path to
* have a root only.
*/
static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
{
int i;
BUG_ON(path_root_bh(dest) != path_root_bh(src));
for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
brelse(dest->p_node[i].bh);
dest->p_node[i].bh = src->p_node[i].bh;
dest->p_node[i].el = src->p_node[i].el;
src->p_node[i].bh = NULL;
src->p_node[i].el = NULL;
}
}
/*
* Insert an extent block at given index.
*
* This will not take an additional reference on eb_bh.
*/
static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
struct buffer_head *eb_bh)
{
struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
/*
* Right now, no root bh is an extent block, so this helps
* catch code errors with dinode trees. The assertion can be
* safely removed if we ever need to insert extent block
* structures at the root.
*/
BUG_ON(index == 0);
path->p_node[index].bh = eb_bh;
path->p_node[index].el = &eb->h_list;
}
static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
struct ocfs2_extent_list *root_el)
{
struct ocfs2_path *path;
BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
path = kzalloc(sizeof(*path), GFP_NOFS);
if (path) {
path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
get_bh(root_bh);
path_root_bh(path) = root_bh;
path_root_el(path) = root_el;
}
return path;
}
/*
* Convenience function to journal all components in a path.
*/
static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
struct ocfs2_path *path)
{
int i, ret = 0;
if (!path)
goto out;
for(i = 0; i < path_num_items(path); i++) {
ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
}
out:
return ret;
}
/*
* Return the index of the extent record which contains cluster #v_cluster.
* -1 is returned if it was not found.
*
* Should work fine on interior and exterior nodes.
*/
int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
{
int ret = -1;
int i;
struct ocfs2_extent_rec *rec;
u32 rec_end, rec_start, clusters;
for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
rec = &el->l_recs[i];
rec_start = le32_to_cpu(rec->e_cpos);
clusters = ocfs2_rec_clusters(el, rec);
rec_end = rec_start + clusters;
if (v_cluster >= rec_start && v_cluster < rec_end) {
ret = i;
break;
}
}
return ret;
}
enum ocfs2_contig_type {
CONTIG_NONE = 0,
CONTIG_LEFT,
CONTIG_RIGHT,
CONTIG_LEFTRIGHT,
};
/*
* NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
* ocfs2_extent_contig only work properly against leaf nodes!
*/
static int ocfs2_block_extent_contig(struct super_block *sb,
struct ocfs2_extent_rec *ext,
u64 blkno)
{
u64 blk_end = le64_to_cpu(ext->e_blkno);
blk_end += ocfs2_clusters_to_blocks(sb,
le16_to_cpu(ext->e_leaf_clusters));
return blkno == blk_end;
}
static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
struct ocfs2_extent_rec *right)
{
u32 left_range;
left_range = le32_to_cpu(left->e_cpos) +
le16_to_cpu(left->e_leaf_clusters);
return (left_range == le32_to_cpu(right->e_cpos));
}
static enum ocfs2_contig_type
ocfs2_extent_contig(struct inode *inode,
struct ocfs2_extent_rec *ext,
struct ocfs2_extent_rec *insert_rec)
{
u64 blkno = le64_to_cpu(insert_rec->e_blkno);
/*
* Refuse to coalesce extent records with different flag
* fields - we don't want to mix unwritten extents with user
* data.
*/
if (ext->e_flags != insert_rec->e_flags)
return CONTIG_NONE;
if (ocfs2_extents_adjacent(ext, insert_rec) &&
ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
return CONTIG_RIGHT;
blkno = le64_to_cpu(ext->e_blkno);
if (ocfs2_extents_adjacent(insert_rec, ext) &&
ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
return CONTIG_LEFT;
return CONTIG_NONE;
}
/*
* NOTE: We can have pretty much any combination of contiguousness and
* appending.
*
* The usefulness of APPEND_TAIL is more in that it lets us know that
* we'll have to update the path to that leaf.
*/
enum ocfs2_append_type {
APPEND_NONE = 0,
APPEND_TAIL,
};
enum ocfs2_split_type {
SPLIT_NONE = 0,
SPLIT_LEFT,
SPLIT_RIGHT,
};
struct ocfs2_insert_type {
enum ocfs2_split_type ins_split;
enum ocfs2_append_type ins_appending;
enum ocfs2_contig_type ins_contig;
int ins_contig_index;
int ins_tree_depth;
};
struct ocfs2_merge_ctxt {
enum ocfs2_contig_type c_contig_type;
int c_has_empty_extent;
int c_split_covers_rec;
};
/*
* How many free extents have we got before we need more meta data?
*/
int ocfs2_num_free_extents(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *root_bh,
enum ocfs2_extent_tree_type type)
{
int retval;
struct ocfs2_extent_list *el = NULL;
struct ocfs2_extent_block *eb;
struct buffer_head *eb_bh = NULL;
u64 last_eb_blk = 0;
mlog_entry_void();
if (type == OCFS2_DINODE_EXTENT) {
struct ocfs2_dinode *fe =
(struct ocfs2_dinode *)root_bh->b_data;
if (!OCFS2_IS_VALID_DINODE(fe)) {
OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
retval = -EIO;
goto bail;
}
if (fe->i_last_eb_blk)
last_eb_blk = le64_to_cpu(fe->i_last_eb_blk);
el = &fe->id2.i_list;
}
if (last_eb_blk) {
retval = ocfs2_read_block(osb, last_eb_blk,
&eb_bh, OCFS2_BH_CACHED, inode);
if (retval < 0) {
mlog_errno(retval);
goto bail;
}
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
}
BUG_ON(el->l_tree_depth != 0);
retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
bail:
if (eb_bh)
brelse(eb_bh);
mlog_exit(retval);
return retval;
}
/* expects array to already be allocated
*
* sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
* l_count for you
*/
static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
int wanted,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head *bhs[])
{
int count, status, i;
u16 suballoc_bit_start;
u32 num_got;
u64 first_blkno;
struct ocfs2_extent_block *eb;
mlog_entry_void();
count = 0;
while (count < wanted) {
status = ocfs2_claim_metadata(osb,
handle,
meta_ac,
wanted - count,
&suballoc_bit_start,
&num_got,
&first_blkno);
if (status < 0) {
mlog_errno(status);
goto bail;
}
for(i = count; i < (num_got + count); i++) {
bhs[i] = sb_getblk(osb->sb, first_blkno);
if (bhs[i] == NULL) {
status = -EIO;
mlog_errno(status);
goto bail;
}
ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
status = ocfs2_journal_access(handle, inode, bhs[i],
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
/* Ok, setup the minimal stuff here. */
strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
eb->h_blkno = cpu_to_le64(first_blkno);
eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
eb->h_list.l_count =
cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
suballoc_bit_start++;
first_blkno++;
/* We'll also be dirtied by the caller, so
* this isn't absolutely necessary. */
status = ocfs2_journal_dirty(handle, bhs[i]);
if (status < 0) {
mlog_errno(status);
goto bail;
}
}
count += num_got;
}
status = 0;
bail:
if (status < 0) {
for(i = 0; i < wanted; i++) {
if (bhs[i])
brelse(bhs[i]);
bhs[i] = NULL;
}
}
mlog_exit(status);
return status;
}
/*
* Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
*
* Returns the sum of the rightmost extent rec logical offset and
* cluster count.
*
* ocfs2_add_branch() uses this to determine what logical cluster
* value should be populated into the leftmost new branch records.
*
* ocfs2_shift_tree_depth() uses this to determine the # clusters
* value for the new topmost tree record.
*/
static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
{
int i;
i = le16_to_cpu(el->l_next_free_rec) - 1;
return le32_to_cpu(el->l_recs[i].e_cpos) +
ocfs2_rec_clusters(el, &el->l_recs[i]);
}
/*
* Add an entire tree branch to our inode. eb_bh is the extent block
* to start at, if we don't want to start the branch at the dinode
* structure.
*
* last_eb_bh is required as we have to update it's next_leaf pointer
* for the new last extent block.
*
* the new branch will be 'empty' in the sense that every block will
* contain a single record with cluster count == 0.
*/
static int ocfs2_add_branch(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
struct ocfs2_extent_tree *et,
struct buffer_head *eb_bh,
struct buffer_head **last_eb_bh,
struct ocfs2_alloc_context *meta_ac)
{
int status, new_blocks, i;
u64 next_blkno, new_last_eb_blk;
struct buffer_head *bh;
struct buffer_head **new_eb_bhs = NULL;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *eb_el;
struct ocfs2_extent_list *el;
u32 new_cpos;
mlog_entry_void();
BUG_ON(!last_eb_bh || !*last_eb_bh);
if (eb_bh) {
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
} else
el = et->root_el;
/* we never add a branch to a leaf. */
BUG_ON(!el->l_tree_depth);
new_blocks = le16_to_cpu(el->l_tree_depth);
/* allocate the number of new eb blocks we need */
new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
GFP_KERNEL);
if (!new_eb_bhs) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
meta_ac, new_eb_bhs);
if (status < 0) {
mlog_errno(status);
goto bail;
}
eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
* linked with the rest of the tree.
* conversly, new_eb_bhs[0] is the new bottommost leaf.
*
* when we leave the loop, new_last_eb_blk will point to the
* newest leaf, and next_blkno will point to the topmost extent
* block. */
next_blkno = new_last_eb_blk = 0;
for(i = 0; i < new_blocks; i++) {
bh = new_eb_bhs[i];
eb = (struct ocfs2_extent_block *) bh->b_data;
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
status = -EIO;
goto bail;
}
eb_el = &eb->h_list;
status = ocfs2_journal_access(handle, inode, bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
eb->h_next_leaf_blk = 0;
eb_el->l_tree_depth = cpu_to_le16(i);
eb_el->l_next_free_rec = cpu_to_le16(1);
/*
* This actually counts as an empty extent as
* c_clusters == 0
*/
eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
/*
* eb_el isn't always an interior node, but even leaf
* nodes want a zero'd flags and reserved field so
* this gets the whole 32 bits regardless of use.
*/
eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
if (!eb_el->l_tree_depth)
new_last_eb_blk = le64_to_cpu(eb->h_blkno);
status = ocfs2_journal_dirty(handle, bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
next_blkno = le64_to_cpu(eb->h_blkno);
}
/* This is a bit hairy. We want to update up to three blocks
|