aboutsummaryrefslogblamecommitdiffstats
path: root/Documentation/prio_tree.txt
blob: 2fbb0c49bc5b6d8e4389dd9daa7fb4bbf59a6d17 (plain) (tree)










































































































                                                                                                                      
The prio_tree.c code indexes vmas using 3 different indexes:
	* heap_index  = vm_pgoff + vm_size_in_pages : end_vm_pgoff
	* radix_index = vm_pgoff : start_vm_pgoff
	* size_index = vm_size_in_pages

A regular radix-priority-search-tree indexes vmas using only heap_index and
radix_index. The conditions for indexing are:
	* ->heap_index >= ->left->heap_index &&
		->heap_index >= ->right->heap_index
	* if (->heap_index == ->left->heap_index)
		then ->radix_index < ->left->radix_index;
	* if (->heap_index == ->right->heap_index)
		then ->radix_index < ->right->radix_index;
	* nodes are hashed to left or right subtree using radix_index
	  similar to a pure binary radix tree.

A regular radix-priority-search-tree helps to store and query
intervals (vmas). However, a regular radix-priority-search-tree is only
suitable for storing vmas with different radix indices (vm_pgoff).

Therefore, the prio_tree.c extends the regular radix-priority-search-tree
to handle many vmas with the same vm_pgoff. Such vmas are handled in
2 different ways: 1) All vmas with the same radix _and_ heap indices are
linked using vm_set.list, 2) if there are many vmas with the same radix
index, but different heap indices and if the regular radix-priority-search
tree cannot index them all, we build an overflow-sub-tree that indexes such
vmas using heap and size indices instead of heap and radix indices. For
example, in the figure below some vmas with vm_pgoff = 0 (zero) are
indexed by regular radix-priority-search-tree whereas others are pushed
into an overflow-subtree. Note that all vmas in an overflow-sub-tree have
the same vm_pgoff (radix_index) and if necessary we build different
overflow-sub-trees to handle each possible radix_index. For example,
in the figure we have 3 overflow-sub-trees corresponding to radix indices
0, 2, and 4.

In the final tree the first few (prio_tree_root->index_bits) levels
are indexed using heap and radix indices whereas the overflow-sub-trees below
those levels (i.e. levels prio_tree_root->index_bits + 1 and higher) are
indexed using heap and size indices. In overflow-sub-trees the size_index
is used for hashing the nodes to appropriate places.

Now, an example prio_tree:

  vmas are represented [radix_index, size_index, heap_index]
                 i.e., [start_vm_pgoff, vm_size_in_pages, end_vm_pgoff]

level  prio_tree_root->index_bits = 3
-----
												_
  0			 				[0,7,7]					 |
  							/     \					 |
				      ------------------       ------------			 |     Regular
  				     /					   \			 |  radix priority
  1		 		[1,6,7]					  [4,3,7]		 |   search tree
  				/     \					  /     \		 |
			 -------       -----			    ------       -----		 |  heap-and-radix
			/		    \			   /		      \		 |      indexed
  2		    [0,6,6]	 	   [2,5,7]		[5,2,7]		    [6,1,7]	 |
		    /     \		   /     \		/     \		    /     \	 |
  3		[0,5,5]	[1,5,6]		[2,4,6]	[3,4,7]	    [4,2,6] [5,1,6]	[6,0,6]	[7,0,7]	 |
		   /			   /		       /		   		_
                  /		          /		      /					_
  4	      [0,4,4]		      [2,3,5]		   [4,1,5]				 |
  		 /			 /		      /					 |
  5	     [0,3,3]		     [2,2,4]		  [4,0,4]				 |  Overflow-sub-trees
  		/			/							 |
  6	    [0,2,2]		    [2,1,3]							 |    heap-and-size
  	       /		       /							 |       indexed
  7	   [0,1,1]		   [2,0,2]							 |
  	      /											 |
  8	  [0,0,0]										 |
  												_

Note that we use prio_tree_root->index_bits to optimize the height
of the heap-and-radix indexed tree. Since prio_tree_root->index_bits is
set according to the maximum end_vm_pgoff mapped, we are sure that all
bits (in vm_pgoff) above prio_tree_root->index_bits are 0 (zero). Therefore,
we only use the first prio_tree_root->index_bits as radix_index.
Whenever index_bits is increased in prio_tree_expand, we shuffle the tree
to make sure that the first prio_tree_root->index_bits levels of the tree
are indexed properly using heap and radix indices.

We do not optimize the height of overflow-sub-trees using index_bits.
The reason is: there can be many such overflow-sub-trees and all of
them have to be shuffled whenever the index_bits increases. This may involve
walking the whole prio_tree in prio_tree_insert->prio_tree_expand code
path which is not desirable. Hence, we do not optimize the height of the
heap-and-size indexed overflow-sub-trees using prio_tree->index_bits.
Instead the overflow sub-trees are indexed using full BITS_PER_LONG bits
of size_index. This may lead to skewed sub-trees because most of the
higher significant bits of the size_index are likely to be 0 (zero). In
the example above, all 3 overflow-sub-trees are skewed. This may marginally
affect the performance. However, processes rarely map many vmas with the
same start_vm_pgoff but different end_vm_pgoffs. Therefore, we normally
do not require overflow-sub-trees to index all vmas.

From the above discussion it is clear that the maximum height of
a prio_tree can be prio_tree_root->index_bits + BITS_PER_LONG.
However, in most of the common cases we do not need overflow-sub-trees,
so the tree height in the common cases will be prio_tree_root->index_bits.

It is fair to mention here that prio_tree_root->index_bits
is increased on demand; however, index_bits is not decreased when
vmas are removed from the prio_tree. That's tricky to do. Hence, it's
left as a homework problem.


lude <linux/qnx4_fs.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/errno.h> #include <linux/smp_lock.h> #include <linux/buffer_head.h> /* * check if the filename is correct. For some obscure reason, qnx writes a * new file twice in the directory entry, first with all possible options at 0 * and for a second time the way it is, they want us not to access the qnx * filesystem when whe are using linux. */ static int qnx4_match(int len, const char *name, struct buffer_head *bh, unsigned long *offset) { struct qnx4_inode_entry *de; int namelen, thislen; if (bh == NULL) { printk("qnx4: matching unassigned buffer !\n"); return 0; } de = (struct qnx4_inode_entry *) (bh->b_data + *offset); *offset += QNX4_DIR_ENTRY_SIZE; if ((de->di_status & QNX4_FILE_LINK) != 0) { namelen = QNX4_NAME_MAX; } else { namelen = QNX4_SHORT_NAME_MAX; } /* "" means "." ---> so paths like "/usr/lib//libc.a" work */ if (!len && (de->di_fname[0] == '.') && (de->di_fname[1] == '\0')) { return 1; } thislen = strlen( de->di_fname ); if ( thislen > namelen ) thislen = namelen; if (len != thislen) { return 0; } if (strncmp(name, de->di_fname, len) == 0) { if ((de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK)) != 0) { return 1; } } return 0; } static struct buffer_head *qnx4_find_entry(int len, struct inode *dir, const char *name, struct qnx4_inode_entry **res_dir, int *ino) { unsigned long block, offset, blkofs; struct buffer_head *bh; *res_dir = NULL; if (!dir->i_sb) { printk("qnx4: no superblock on dir.\n"); return NULL; } bh = NULL; block = offset = blkofs = 0; while (blkofs * QNX4_BLOCK_SIZE + offset < dir->i_size) { if (!bh) { bh = qnx4_bread(dir, blkofs, 0); if (!bh) { blkofs++; continue; } } *res_dir = (struct qnx4_inode_entry *) (bh->b_data + offset); if (qnx4_match(len, name, bh, &offset)) { block = qnx4_block_map( dir, blkofs ); *ino = block * QNX4_INODES_PER_BLOCK + (offset / QNX4_DIR_ENTRY_SIZE) - 1; return bh; } if 
(offset < bh->b_size) { continue; } brelse(bh); bh = NULL; offset = 0; blkofs++; } brelse(bh); *res_dir = NULL; return NULL; } struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { int ino; struct qnx4_inode_entry *de; struct qnx4_link_info *lnk; struct buffer_head *bh; const char *name = dentry->d_name.name; int len = dentry->d_name.len; struct inode *foundinode = NULL; lock_kernel(); if (!(bh = qnx4_find_entry(len, dir, name, &de, &ino))) goto out; /* The entry is linked, let's get the real info */ if ((de->di_status & QNX4_FILE_LINK) == QNX4_FILE_LINK) { lnk = (struct qnx4_link_info *) de; ino = (le32_to_cpu(lnk->dl_inode_blk) - 1) * QNX4_INODES_PER_BLOCK + lnk->dl_inode_ndx; } brelse(bh); if ((foundinode = iget(dir->i_sb, ino)) == NULL) { unlock_kernel(); QNX4DEBUG(("qnx4: lookup->iget -> NULL\n")); return ERR_PTR(-EACCES); } out: unlock_kernel(); d_add(dentry, foundinode); return NULL; } #ifdef CONFIG_QNX4FS_RW int qnx4_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) { QNX4DEBUG(("qnx4: qnx4_create\n")); if (dir == NULL) { return -ENOENT; } return -ENOSPC; } int qnx4_rmdir(struct inode *dir, struct dentry *dentry) { struct buffer_head *bh; struct qnx4_inode_entry *de; struct inode *inode; int retval; int ino; QNX4DEBUG(("qnx4: qnx4_rmdir [%s]\n", dentry->d_name.name)); lock_kernel(); bh = qnx4_find_entry(dentry->d_name.len, dir, dentry->d_name.name, &de, &ino); if (bh == NULL) { unlock_kernel(); return -ENOENT; } inode = dentry->d_inode; if (inode->i_ino != ino) { retval = -EIO; goto end_rmdir; } #if 0 if (!empty_dir(inode)) { retval = -ENOTEMPTY; goto end_rmdir; } #endif if (inode->i_nlink != 2) { QNX4DEBUG(("empty directory has nlink!=2 (%d)\n", inode->i_nlink)); } QNX4DEBUG(("qnx4: deleting directory\n")); de->di_status = 0; memset(de->di_fname, 0, sizeof de->di_fname); de->di_mode = 0; mark_buffer_dirty(bh); inode->i_nlink = 0; mark_inode_dirty(inode); inode->i_ctime = dir->i_ctime = 
dir->i_mtime = CURRENT_TIME_SEC; dir->i_nlink--; mark_inode_dirty(dir); retval = 0; end_rmdir: brelse(bh); unlock_kernel(); return retval; } int qnx4_unlink(struct inode *dir, struct dentry *dentry) { struct buffer_head *bh; struct qnx4_inode_entry *de; struct inode *inode; int retval; int ino; QNX4DEBUG(("qnx4: qnx4_unlink [%s]\n", dentry->d_name.name)); lock_kernel(); bh = qnx4_find_entry(dentry->d_name.len, dir, dentry->d_name.name, &de, &ino); if (bh == NULL) { unlock_kernel(); return -ENOENT; } inode = dentry->d_inode; if (inode->i_ino != ino) { retval = -EIO; goto end_unlink; } retval = -EPERM; if (!inode->i_nlink) { QNX4DEBUG(("Deleting nonexistent file (%s:%lu), %d\n", inode->i_sb->s_id, inode->i_ino, inode->i_nlink)); inode->i_nlink = 1; } de->di_status = 0; memset(de->di_fname, 0, sizeof de->di_fname); de->di_mode = 0; mark_buffer_dirty(bh); dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; mark_inode_dirty(dir); inode->i_nlink--; inode->i_ctime = dir->i_ctime; mark_inode_dirty(inode); retval = 0; end_unlink: unlock_kernel(); brelse(bh); return retval; } #endif