从Trie树谈到后缀树之后缀树的C++简单实现_数据分析师
本程序只是简单实现了后缀树的构造,查询等基本的操作。如果此文能引出读者更好的suffixtree 的实现,此文的价值便更甚了。谢谢。
suffix-tree的c++实现
完整源码如下,由于注释已经够详细,便不再多说什么了:
-
// suffer tree.cpp : 定义控制台应用程序的入口点。
-
//copyright@ 2011 July && 匿名
-
-
//1、参考了他人的程序,但不知道原作者是谁。
-
//如果你知道,请立马联系我,一定把原作者的大名署上,谢谢。
-
//2、主函数没有写全了,读者视需要自己补上吧。
-
-
#include "stdafx.h"
-
#ifndef __SUFFIXTREE_H
-
#define __SUFFIXTREE_H
-
#include
-
#include
-
#include
-
usingnamespacestd;
-
usingstd::list;
-
usingstd::vector;
-
-
//对于叶节点,通过m_nActiveLen和m_nEdgeLabelEndPos可以得到节点对应的字符串.对于分支节点,也是如此.
-
structtagSuffixNode
-
{
-
public:
-
tagSuffixNode* m_tpChild;//子节点,如果为0,说明是叶子节点
-
tagSuffixNode* m_tpParent;//父节点,根节点的父节点为0
-
tagSuffixNode* m_tpLeftSibling;//左兄弟
-
tagSuffixNode* m_tpRightSibling;//右兄弟
-
tagSuffixNode* m_tpMostRightChild;//最右子节点
-
tagSuffixNode* m_tpSuffixLink;//指向当前节点表示的子串的最大后缀所对应的分支节点
-
-
intm_nActiveLen;//节点代表的字符串的长度.
-
intm_nEdgeLabelStartPos;//入边开始字符的下标.对于分支节点,只是用来表示边对应的字符串是什么,并不表示其在子孙叶节点表示的串中对应位置的字符在整个串中的下标.
-
intm_nEdgeLabelEndPos;//入边结束字符的下标
-
boolm_bIsLeaf;//是否是叶节点
-
-
charm_cIndex;//test
-
-
public:
-
//全部初始化为0
-
voidinit()
-
{
-
m_tpChild = 0;
-
m_tpParent = 0;
-
m_tpLeftSibling = 0;
-
m_tpRightSibling = 0;
-
m_tpMostRightChild = 0;
-
m_tpSuffixLink = 0;
-
-
m_nEdgeLabelStartPos = 0;
-
m_nEdgeLabelEndPos = 0;
-
-
m_nActiveLen = 0;
-
m_bIsLeaf =true;
-
}
-
};
-
-
classCSuffixTree
-
{
-
public:
-
CSuffixTree(char*str);
-
~CSuffixTree();
-
voiddeleteTree();
-
voidconstruct();
-
voidreset(char*str);
-
intsearch(char*str);
-
tagSuffixNode* getTree();
-
voidprintTree();
-
voidtest();
-
private:
-
tagSuffixNode* __allocNode();
-
tagSuffixNode* __findChildBeginWithChar(tagSuffixNode* node,charc);
-
-
void__printHelper(tagSuffixNode* node,intlevel);
-
private:
-
intm_nActiveLen;
-
intm_nStrLen;//字符串长度
-
tagSuffixNode* m_tpRoot;//根节点
-
tagSuffixNode* m_tpActiveNode;//活动节点
-
char*m_cpInternalStr;//字符串内存存储
-
-
list m_tagNodeList;//保存分配的节点的指针
-
vector m_tagFromNodeVec;//保存还未确定后缀指针的分支节点的指针,用于判定suffix link
-
vector m_tagToNodeVec;//保存所有分支节点
-
#ifdef DEBUG
-
charm_cIndex;
-
#endif
-
};
-
-
#endif //
-
-
//#include "suffixtree.h"
-
#include
-
#include
-
#include
-
#include
-
#include
-
usingnamespacestd;
-
-
CSuffixTree::CSuffixTree(char*str)
-
:m_nActiveLen(0),
-
m_tpRoot(0),
-
m_tpActiveNode(0)
-
{
-
m_nStrLen = strlen(str) + 1;
-
m_cpInternalStr =newchar[m_nStrLen+1];
-
-
sprintf(m_cpInternalStr,"%s#", str);
-
#ifdef DEBUG
-
m_cIndex ='A';
-
#endif
-
}
-
-
CSuffixTree::~CSuffixTree()
-
{
-
deleteTree();
-
delete[] m_cpInternalStr;
-
}
-
-
voidCSuffixTree::deleteTree()
-
{
-
list::iterator iter = m_tagNodeList.begin();
-
for(; iter != m_tagNodeList.end(); ++iter)
-
{
-
delete*iter;
-
}
-
-
m_tagNodeList.clear();
-
}
-
-
voidCSuffixTree::reset(char*str)
-
{
-
deleteTree();
-
-
inttmp = strlen(str);
-
if(tmp+1 > m_nStrLen)
-
{
-
m_nStrLen = tmp+1;
-
delete[] m_cpInternalStr;
-
m_cpInternalStr =newchar[m_nStrLen+1];
-
}
-
else
-
{
-
m_nStrLen = tmp+1;
-
}
-
sprintf(m_cpInternalStr,"%s#", str);
-
-
m_nActiveLen = 0;
-
m_nStrLen = 0;
-
m_tpActiveNode = 0;
-
m_tpRoot = 0;
-
-
m_tagFromNodeVec.clear();
-
//reconstruct tree for new string
-
construct();
-
}
-
-
-
//suffixTree之构造
-
voidCSuffixTree::construct()
-
{
-
m_tpRoot = __allocNode();
-
if(m_tpRoot == 0)
-
{
-
#ifdef DEBUG
-
printf("__allocNode error!\n");
-
#endif
-
return;
-
}
-
m_nActiveLen = 0;
-
m_tpRoot->m_nActiveLen = 0;
-
//initial active node
-
m_tpActiveNode = m_tpRoot;
-
-
m_tagToNodeVec.push_back(m_tpRoot);
-
-
for(inti = 0; i
-
{
-
#ifdef DEBUG
-
printf("\ninsert %s\n", &(m_cpInternalStr[i]));
-
printf("active node:[%c]\n", m_tpActiveNode->m_cIndex);
-
#endif
-
//判断是否有后继节点
-
if(m_tpActiveNode->m_tpSuffixLink != 0)
-
{
-
m_tpActiveNode = m_tpActiveNode->m_tpSuffixLink;
-
m_nActiveLen--;
-
}
-
intpos = i;
-
-
#ifdef DEBUG
-
printf("new active node:[%c]\n", m_tpActiveNode->m_cIndex);
-
printf("pos:%d\n", pos);
-
printf("active length:%d\n", m_nActiveLen);
-
#endif
-
-
while(true)
-
{
-
//查找以当前后缀串的开始字符开始的子节点
-
tagSuffixNode* node = __findChildBeginWithChar(m_tpActiveNode, m_cpInternalStr[i+m_nActiveLen]);
-
//未找到以suffix[i]的开始字符开头的节点
-
if(node == 0)
-
{
-
#ifdef DEBUG
-
printf("__findChildBeginWithChar null\n");
-
#endif
-
tagSuffixNode* child = __allocNode();
-
-
//设置开始、结束位置的下标
-
child->m_nEdgeLabelStartPos = pos;
-
child->m_nEdgeLabelEndPos = m_nStrLen-1;
-
//设置叶节点代表的后缀串的长度
-
child->m_nActiveLen = m_nStrLen - i;
-
//设置父节点
-
child->m_tpParent = m_tpActiveNode;
-
if( m_tpActiveNode->m_tpChild == 0)
-
{
-
m_tpActiveNode->m_tpChild = child;
-
m_tpActiveNode->m_tpMostRightChild = child;
-
}
-
else
-
{
-
m_tpActiveNode->m_tpMostRightChild->m_tpRightSibling = child;
-
child->m_tpLeftSibling = m_tpActiveNode->m_tpMostRightChild;
-
m_tpActiveNode->m_tpMostRightChild = child;
-
}
-
-
break;
-
}
-
else
-
{
-
#ifdef DEBUG
-
printf("__findChildBeginWithChar ok\n");
-
printf("node start:%d\n", node->m_nEdgeLabelStartPos);
-
printf("node end:%d\n", node->m_nEdgeLabelEndPos);
-
printf("node index:%c\n", node->m_cIndex);
-
#endif
-
//TODO
-
//确定 m_nMinDistance
-
intedgeStart = node->m_nEdgeLabelStartPos;
-
intstrStart = i + m_nActiveLen;
-
boolsplit =false;
-
-
for(; edgeStart<=node->m_nEdgeLabelEndPos; edgeStart++, strStart++)
-
{
-
if( m_cpInternalStr[edgeStart] != m_cpInternalStr[strStart])
-
{
-
split =true;
-
break;
-
}
-
}
-
if(!split)
-
{
-
m_tpActiveNode = node;
-
m_nActiveLen += node->m_nEdgeLabelEndPos - node->m_nEdgeLabelStartPos + 1;
-
pos += node->m_nEdgeLabelEndPos - node->m_nEdgeLabelStartPos + 1;
-
continue;
-
}
-
else
-
{
-
tagSuffixNode* parent = node->m_tpParent;
-
//new branch node
-
tagSuffixNode* branch = __allocNode();
-
branch->m_bIsLeaf =false;
-
branch->m_nEdgeLabelStartPos = node->m_nEdgeLabelStartPos;
-
branch->m_nEdgeLabelEndPos = edgeStart-1;
-
branch->m_nActiveLen = parent->m_nActiveLen + (branch->m_nEdgeLabelEndPos - branch->m_nEdgeLabelStartPos + 1);
-
//original node
-
node->m_nEdgeLabelStartPos = edgeStart;
-
-
tagSuffixNode* info = __allocNode();
-
info->m_nEdgeLabelStartPos = strStart;
-
info->m_nEdgeLabelEndPos = m_nStrLen - 1;
-
info->m_nActiveLen = branch->m_nActiveLen + (info->m_nEdgeLabelEndPos - info->m_nEdgeLabelStartPos + 1);
-
-
branch->m_tpParent = parent;
-
branch->m_tpRightSibling = parent->m_tpChild;
-
parent->m_tpChild->m_tpLeftSibling = branch;
-
parent->m_tpChild = branch;
-
-
node->m_tpLeftSibling->m_tpRightSibling = node->m_tpRightSibling;
-
if( node->m_tpRightSibling != 0)
-
{
-
node->m_tpRightSibling->m_tpLeftSibling = node->m_tpLeftSibling;
-
}
-
else
-
{
-
parent->m_tpMostRightChild = node->m_tpLeftSibling;
-
}
-
-
branch->m_tpChild = info;
-
branch->m_tpMostRightChild = node;
-
-
info->m_tpRightSibling = node;
-
info->m_tpParent = branch;
-
-
node->m_tpParent = branch;
-
node->m_tpLeftSibling = info;
-
node->m_tpRightSibling = 0;
-
-
-
//设置节点的suffix link
-
-
boolsetSuffix =false;
-
vector::iterator iter = m_tagToNodeVec.begin();
-
for(; iter != m_tagToNodeVec.end(); ++iter)
-
{
-
tagSuffixNode* tmp = *iter;
-
if( strncmp( &(m_cpInternalStr[branch->m_nEdgeLabelEndPos - branch->m_nActiveLen + 2]), &(m_cpInternalStr[tmp->m_nEdgeLabelEndPos - tmp->m_nActiveLen + 1]), branch->m_nActiveLen - 1) == 0)
-
{
-
branch->m_tpSuffixLink = tmp;
-
setSuffix =true;
-
break;
-
#ifdef DEBUG
-
printf("branch[%c] suffix_link is branch[%c]\n", tmp->m_cIndex, branch->m_cIndex);
-
#endif
-
}
-
}
-
m_tagToNodeVec.push_back(branch);
-
-
vector::iterator iter2 = m_tagFromNodeVec.begin();
-
for(; iter2 != m_tagFromNodeVec.end(); ++iter2)
-
{
-
tagSuffixNode* tmp = *iter2;
-
//找到后缀节点,结束循环
-
if( strncmp( &(m_cpInternalStr[tmp->m_nEdgeLabelEndPos - tmp->m_nActiveLen + 2]), &(m_cpInternalStr[branch->m_nEdgeLabelEndPos - branch->m_nActiveLen + 1]), tmp->m_nActiveLen - 1) == 0)
-
{
-
tmp->m_tpSuffixLink = branch;
-
m_tagFromNodeVec.erase(iter2);
-
#ifdef DEBUG
-
printf("branch[%c] suffix_link is branch[%c]\n", tmp->m_cIndex, branch->m_cIndex);
-
#endif
-
break;
-
}
-
}
-
-
if( !setSuffix )
-
{
-
m_tagFromNodeVec.push_back(branch);
-
}
-
-
//已经将新的后缀字符串插入到树中,这时可以结束while了。
-
break;
-
}
-
}
-
}
-
//test
-
#ifdef DEBUG
-
printf("print\n");
-
printTree();
-
#endif
-
}
-
}
-
-
-
//查询
-
intCSuffixTree::search(char*str)
-
{
-
if(str == 0)
-
return-1;
-
//TODO
-
//添加处理过程
-
-
intlen = strlen(str);
-
-
tagSuffixNode* node = 0;
-
tagSuffixNode* cur = m_tpRoot;
-
for(inti=0; i
-
{
-
node = __findChildBeginWithChar(cur, str[i]);
-
if(node == 0)
-
{
-
break;
-
}
-
else
-
{
-
intedgeLen = node->m_nEdgeLabelEndPos - node->m_nEdgeLabelStartPos + 1;
-
intremain = len - i;
-
if( remain <= edgeLen )
-
{
-
if( strncmp(&(str[i]), &(m_cpInternalStr[node->m_nEdgeLabelStartPos]), remain) != 0)
-
{
-
return-1;
-
}
-
else
-
{
-
returnnode->m_nEdgeLabelEndPos - node->m_nActiveLen + 1;
-
}
-
}
-
else
-
{
-
if( strncmp(&(str[i]), &(m_cpInternalStr[node->m_nEdgeLabelStartPos]), edgeLen) != 0)
-
{
-
return-1;
-
}
-
else
-
{
-
i += edgeLen;
-
cur = node;
-
}
-
}
-
}
-
}
-
return-1;
-
}
-
-
tagSuffixNode* CSuffixTree::__allocNode()
-
{
-
tagSuffixNode* node =newtagSuffixNode;
-
-
m_tagNodeList.push_back(node);
-
-
node->init();
-
#ifdef DEBUG
-
node->m_cIndex = m_cIndex;
-
m_cIndex++;
-
#endif
-
returnnode;
-
}
-
-
tagSuffixNode* CSuffixTree::__findChildBeginWithChar(tagSuffixNode* node,charc)
-
{
-
assert(node != 0);
-
-
tagSuffixNode* child = node->m_tpChild;
-
while(child != 0)
-
{
-
if( m_cpInternalStr[child->m_nEdgeLabelStartPos] == c)
-
returnchild;
-
-
child = child->m_tpRightSibling;
-
}
-
-
return0;
-
}
-
-
voidCSuffixTree::test()
-
{
-
//printf("%s\n", m_cpInternalStr);
-
printTree();
-
}
-
-
voidCSuffixTree::printTree()
-
{
-
#ifdef DEBUG
-
printf("[A]\n");
-
#endif
-
__printHelper(m_tpRoot, 0);
-
}
-
-
voidCSuffixTree::__printHelper(tagSuffixNode* node,intlevel)
-
{
-
intstart = node->m_nEdgeLabelStartPos;
-
intend = node->m_nEdgeLabelEndPos;
-
tagSuffixNode* child = node->m_tpChild;
-
if( level > 0 )
-
{
-
for(inti=0; i
-
{
-
printf(" |");
-
}
-
printf(" + ");
-
for(intj = start; j <= end; j++)
-
{
-
printf("%c", m_cpInternalStr[j]);
-
}
-
#ifdef DEBUG
-
printf("[%c]", node->m_cIndex);
-
// printf("[%d:%d:%d]", node->m_nEdgeLabelStartPos, node->m_nEdgeLabelEndPos, node->m_nActiveLen);
-
#endif
-
printf("\n");
-
}
-
-
while( child != 0 )
-
{
-
__printHelper(child, level+1);
-
child = child->m_tpRightSibling;
-
}
-
}
-
-
intmain()
-
{
-
cout<<"hello suffixtree"<
-
system("pause");
-
return0;
-
}
运行结果如下:
Shlomo Yona的实现
下面是国外一牛人Shlomo Yona写的实现(摘取了部分非完整代码):
-
/******************************************************************************
-
Suffix Tree Version 2.1
-
-
AUTHORS
-
-
Dotan Tsadok
-
Instructor: Mr. Shlomo Yona, University of Haifa, Israel. December 2002.
-
Current maintainer: Shlomo Yona
-
-
COPYRIGHT
-
-
Copyright 2002-2003 Shlomo Yona
-
-
LICENSE
-
-
This library is free software; you can redistribute it and/or modify it
-
under the same terms as Perl itself.
-
-
-
DESCRIPTION OF THIS FILE:
-
-
This is the implementation file suffix_tree.c implementing the header file
-
suffix_tree.h.
-
-
This code is an Open Source implementation of Ukkonen's algorithm for
-
constructing a suffix tree over a string in time and space complexity
-
O(length of the string). The code is written under strict ANSI C.
-
-
For a complete understanding of the code see Ukkonen's algorithm and the
-
readme.txt file.
-
-
The general pseudo code is:
-
-
n = length of the string.
-
ST_CreateTree:
-
Calls n times to SPA (Single Phase Algorithm). SPA:
-
Increase the variable e (virtual end of all leaves).
-
Calls SEA (Single Extension Algorithm) starting with the first extension that
-
does not already exist in the tree and ending at the first extension that
-
already exists. SEA :
-
Follow suffix link.
-
Check if current suffix exists in the tree.
-
If it does not - apply rule 2 and then create a new suffix link.
-
apply_rule_2:
-
Create a new leaf and maybe a new internal node as well.
-
create_node:
-
Create a new node or a leaf.
-
-
-
For implementation interpretations see Basic Ideas paragraph in the Developement
-
section of the readme.txt file.
-
-
An example of the implementations of a node and its sons using linked lists
-
instead of arrays:
-
-
(1)
-
|
-
|
-
|
-
(2)--(3)--(4)--(5)
-
-
(2) is the only son of (1) (call it the first son). Other sons of (1) are
-
connected using a linked lists starting from (2) and going to the right. (3) is
-
the right sibling of (2) (and (2) is the left sibling of (3)), (4) is the right
-
sibling of (3), etc.
-
The father field of all (2), (3), (4) and (5) points to (1), but the son field
-
of (1) points only to (2).
-
-
*******************************************************************************/
-
-
#include "stdlib.h"
-
#include "stdio.h"
-
#include "string.h"
-
#include "suffix_tree.h"
-
-
/* See function body */
-
voidST_PrintTree(SUFFIX_TREE* tree);
-
/* See function body */
-
voidST_PrintFullNode(SUFFIX_TREE* tree, NODE* node);
-
-
/* Used in function trace_string for skipping (Ukkonen's Skip Trick). */
-
typedefenumSKIP_TYPE {skip, no_skip} SKIP_TYPE;
-
/* Used in function apply_rule_2 - two types of rule 2 - see function for more
-
details.*/
-
typedefenumRULE_2_TYPE {new_son, split} RULE_2_TYPE;
-
/* Signals whether last matching position is the last one of the current edge */
-
typedefenumLAST_POS_TYPE {last_char_in_edge, other_char} LAST_POS_TYPE;
-
-
/* Used for statistic measures of speed. */
-
DBL_WORD counter;
-
/* Used for statistic measures of space. */
-
DBL_WORD heap;
-
/* Used to mark the node that has no suffix link yet. By Ukkonen, it will have
-
one by the end of the current phase. */
-
NODE* suffixless;
-
-
typedefstructSUFFIXTREEPATH
-
{
-
DBL_WORD begin;
-
DBL_WORD end;
-
} PATH;
-
-
typedefstructSUFFIXTREEPOS
-
{
-
NODE* node;
-
DBL_WORD edge_pos;
-
}POS;
-
-
/******************************************************************************/
-
/*
-
Define STATISTICS in order to view measures of speed and space while
-
constructing and searching the suffix tree. Measures will be printed on the
-
screen.
-
*/
-
/* #define STATISTICS */
-
-
/*
-
Define DEBUG in order to view debug printouts to the screen while
-
constructing and searching the suffix tree.
-
*/
-
/* #define DEBUG */
-
-
/******************************************************************************/
-
/*
-
create_node :
-
Creates a node with the given init field-values.
-
-
Input : The father of the node, the starting and ending indices
-
of the incloming edge to that node,
-
the path starting position of the node.
-
-
Output: A pointer to that node.
-
*/
-
-
-
NODE* create_node(NODE* father, DBL_WORD start, DBL_WORD end, DBL_WORD position)
-
{
-
/*Allocate a node.*/
-
NODE* node = (NODE*)malloc(sizeof(NODE));
-
if(node == 0)
-
{
-
printf("\nOut of memory.\n");
-
exit(0);
-
}
-
-
#ifdef STATISTICS
-
heap+=sizeof(NODE);
-
#endif
-
-
/* Initialize node fields. For detailed description of the fields see
-
suffix_tree.h */
-
node->sons = 0;
-
node->right_sibling = 0;
-
node->left_sibling = 0;
-
node->suffix_link = 0;
-
node->father = father;
-
node->path_position = position;
-
node->edge_label_start = start;
-
node->edge_label_end = end;
-
returnnode;
-
}
-
-
/******************************************************************************/
-
/*
-
find_son :
-
Finds son of node that starts with a certain character.
-
-
Input : the tree, the node to start searching from and the character to be
-
searched in the sons.
-
-
Output: A pointer to the found son, 0 if no such son.
-
*/
-
-
NODE* find_son(SUFFIX_TREE* tree, NODE* node,charcharacter)
-
{
-
/* Point to the first son. */
-
node = node->sons;
-
/* scan all sons (all right siblings of the first son) for their first
-
character (it has to match the character given as input to this function. */
-
while(node != 0 && tree->tree_string[node->edge_label_start] != character)
-
{
-
#ifdef STATISTICS
-
counter++;
-
#endif
-
node = node->right_sibling;
-
}
-
returnnode;
-
}
-
-
/******************************************************************************/
-
/*
-
get_node_label_end :
-
Returns the end index of the incoming edge to that node. This function is
-
needed because for leaves the end index is not relevant, instead we must look
-
at the variable "e" (the global virtual end of all leaves). Never refer
-
directly to a leaf's end-index.
-
-
Input : the tree, the node its end index we need.
-
-
Output: The end index of that node (meaning the end index of the node's
-
incoming edge).
-
*/
-
-
DBL_WORD get_node_label_end(SUFFIX_TREE* tree, NODE* node)
-
{
-
/* If it's a leaf - return e */
-
if(node->sons == 0)
-
returntree->e;
-
/* If it's not a leaf - return its real end */
-
returnnode->edge_label_end;
-
}
-
-
/******************************************************************************/
-
/*
-
get_node_label_length :
-
Returns the length of the incoming edge to that node. Uses get_node_label_end
-
(see above).
-
-
Input : The tree and the node its length we need.
-
-
Output: the length of that node.
-
*/
-
-
DBL_WORD get_node_label_length(SUFFIX_TREE* tree, NODE* node)
-
{
-
/* Calculate and return the lentgh of the node */
-
returnget_node_label_end(tree, node) - node->edge_label_start + 1;
-
}
-
-
/******************************************************************************/
-
/*
-
is_last_char_in_edge :
-
Returns 1 if edge_pos is the last position in node's incoming edge.
-
-
Input : The tree, the node to be checked and the position in its incoming
-
edge.
-
-
Output: the length of that node.
-
*/
-
-
charis_last_char_in_edge(SUFFIX_TREE* tree, NODE* node, DBL_WORD edge_pos)
-
{
-
if(edge_pos == get_node_label_length(tree,node)-1)
-
return1;
-
return0;
-
}
-
-
/******************************************************************************/
-
/*
-
connect_siblings :
-
Connect right_sib as the right sibling of left_sib and vice versa.
-
-
Input : The two nodes to be connected.
-
-
Output: None.
-
*/
-
-
voidconnect_siblings(NODE* left_sib, NODE* right_sib)
-
{
-
/* Connect the right node as the right sibling of the left node */
-
if(left_sib != 0)
-
left_sib->right_sibling = right_sib;
-
/* Connect the left node as the left sibling of the right node */
-
if(right_sib != 0)
-
right_sib->left_sibling = left_sib;
-
}
-
-
/******************************************************************************/
-
/*
-
apply_extension_rule_2 :
-
Apply "extension rule 2" in 2 cases:
-
1. A new son (leaf 4) is added to a node that already has sons:
-
(1) (1)
-
/ \ -> / | \
-
(2) (3) (2)(3)(4)
-
-
2. An edge is split and a new leaf (2) and an internal node (3) are added:
-
| |
-
| (3)
-
| -> / \
-
(1) (1) (2)
-
-
Input : See parameters.
-
-
Output: A pointer to the newly created leaf (new_son case) or internal node
-
(split case).
-
*/
-
-
NODE* apply_extension_rule_2(
-
/* Node 1 (see drawings) */
-
NODE* node,
-
/* Start index of node 2's incoming edge */
-
DBL_WORD edge_label_begin,
-
/* End index of node 2's incoming edge */
-
DBL_WORD edge_label_end,
-
/* Path start index of node 2 */
-
DBL_WORD path_pos,
-
/* Position in node 1's incoming edge where split is to be
-
performed */
-
DBL_WORD edge_pos,
-
/* Can be 'new_son' or 'split' */
-
RULE_2_TYPE type)
-
{
-
NODE *new_leaf,
-
*new_internal,
-
*son;
-
/*-------new_son-------*/
-
if(type == new_son)
-
{
-
#ifdef DEBUG
-
printf("rule 2: new leaf (%lu,%lu)\n",edge_label_begin,edge_label_end);
-
#endif
-
/* Create a new leaf (4) with the characters of the extension */
-
new_leaf = create_node(node, edge_label_begin , edge_label_end, path_pos);
-
/* Connect new_leaf (4) as the new son of node (1) */
-
son = node->sons;
-
while(son->right_sibling != 0)
-
son = son->right_sibling;
-
connect_siblings(son, new_leaf);
-
/* return (4) */
-
returnnew_leaf;
-
}
-
/*-------split-------*/
-
#ifdef DEBUG
-
printf("rule 2: split (%lu,%lu)\n",edge_label_begin,edge_label_end);
-
#endif
-
/* Create a new internal node (3) at the split point */
-
new_internal = create_node(
-
node->father,
-
node->edge_label_start,
-
node->edge_label_start+edge_pos,
-
node->path_position);
-
/* Update the node (1) incoming edge starting index (it now starts where node
-
(3) incoming edge ends) */
-
node->edge_label_start += edge_pos+1;
-
-
/* Create a new leaf (2) with the characters of the extension */
-
new_leaf = create_node(
-
new_internal,
-
edge_label_begin,
-
edge_label_end,
-
path_pos);
-
-
/* Connect new_internal (3) where node (1) was */
-
/* Connect (3) with (1)'s left sibling */
-
connect_siblings(node->left_sibling, new_internal);
-
/* connect (3) with (1)'s right sibling */
-
connect_siblings(new_internal, node->right_sibling);
-
node->left_sibling = 0;
-
-
/* Connect (3) with (1)'s father */
-
if(new_internal->father->sons == node)
-
new_internal->father->sons = new_internal;
-
-
/* Connect new_leaf (2) and node (1) as sons of new_internal (3) */
-
new_internal->sons = node;
-
node->father = new_internal;
-
connect_siblings(node, new_leaf);
-
/* return (3) */
-
returnnew_internal;
-
}
-
-
/******************************************************************************/
-
/*
-
trace_single_edge :
-
Traces for a string in a given node's OUTcoming edge. It searches only in the
-
given edge and not other ones. Search stops when either whole string was
-
found in the given edge, a part of the string was found but the edge ended
-
(and the next edge must be searched too - performed by function trace_string)
-
or one non-matching character was found.
-
-
Input : The string to be searched, given in indices of the main string.
-
-
Output: (by value) the node where tracing has stopped.
-
(by reference) the edge position where last match occured, the string
-
position where last match occured, number of characters found, a flag
-
for signaling whether search is done, and a flag to signal whether
-
search stopped at a last character of an edge.
-
*/
-
-
NODE* trace_single_edge(
-
SUFFIX_TREE* tree,
-
/* Node to start from */
-
NODE* node,
-
/* String to trace */
-
PATH str,
-
/* Last matching position in edge */
-
DBL_WORD* edge_pos,
-
/* Last matching position in tree source string */
-
DBL_WORD* chars_found,
-
/* Skip or no_skip*/
-
SKIP_TYPE type,
-
/* 1 if search is done, 0 if not */
-
int* search_done)
-
{
-
NODE* cont_node;
-
DBL_WORD length,str_len;
-
-
/* Set default return values */
-
*search_done = 1;
-
*edge_pos = 0;
-
-
/* Search for the first character of the string in the outcoming edge of
-
node */
-
cont_node = find_son(tree, node, tree->tree_string[str.begin]);
-
if(cont_node == 0)
-
{
-
/* Search is done, string not found */
-
*edge_pos = get_node_label_length(tree,node)-1;
-
*chars_found = 0;
-
returnnode;
-
}
-
-
/* Found first character - prepare for continuing the search */
-
node = cont_node;
-
length = get_node_label_length(tree,node);
-
str_len = str.end - str.begin + 1;
-
-
/* Compare edge length and string length. */
-
/* If edge is shorter then the string being searched and skipping is
-
enabled - skip edge */
-
if(type == skip)
-
{
-
if(length <= str_len)
-
{
-
(*chars_found) = length;
-
(*edge_pos) = length-1;
-
if(length < str_len)
-
*search_done = 0;
-
}
-
else
-
{
-
(*chars_found) = str_len;
-
(*edge_pos) = str_len-1;
-
}
-
-
#ifdef STATISTICS
-
counter++;
-
#endif
-
-
returnnode;
-
}
-
else
-
{
-
/* Find minimum out of edge length and string length, and scan it */
-
if(str_len < length)
-
length = str_len;
-
-
for(*edge_pos=1, *chars_found=1; *edge_pos
-
{
-
-
#ifdef STATISTICS
-
counter++;
-
#endif
-
-
/* Compare current characters of the string and the edge. If equal -
-
continue */
-
if(tree->tree_string[node->edge_label_start+*edge_pos] != tree->tree_string[str.begin+*edge_pos])
-
{
-
(*edge_pos)--;
-
returnnode;
-
}
-
}
-
}
-
-
/* The loop has advanced *edge_pos one too much */
-
(*edge_pos)--;
-
-
if((*chars_found) < str_len)
-
/* Search is not done yet */
-
*search_done = 0;
-
-
returnnode;
-
}
-
-
/******************************************************************************/
-
/*
-
trace_string :
-
Traces for a string in the tree. This function is used in construction
-
process only, and not for after-construction search of substrings. It is
-
tailored to enable skipping (when we know a suffix is in the tree (when
-
following a suffix link) we can avoid comparing all symbols of the edge by
-
skipping its length immediately and thus save atomic operations - see
-
Ukkonen's algorithm, skip trick).
-
This function, in contradiction to the function trace_single_edge, 'sees' the
-
whole picture, meaning it searches a string in the whole tree and not just in
-
a specific edge.
-
-
Input : The string, given in indice of the main string.
-
-
Output: (by value) the node where tracing has stopped.
-
(by reference) the edge position where last match occured, the string
-
position where last match occured, number of characters found, a flag
-
for signaling whether search is done.
-
*/
-
-
NODE* trace_string(
-
SUFFIX_TREE* tree,
-
/* Node to start from */
-
NODE* node,
-
/* String to trace */
-
PATH str,
-
/* Last matching position in edge */
-
DBL_WORD* edge_pos,
-
/* Last matching position in tree string */
-
DBL_WORD* chars_found,
-
/* skip or not */
-
SKIP_TYPE type)
-
{
-
/* This variable will be 1 when search is done.
-
It is a return value from function trace_single_edge */
-
intsearch_done = 0;
-
-
/* This variable will hold the number of matching characters found in the
-
current edge. It is a return value from function trace_single_edge */
-
DBL_WORD edge_chars_found;
-
-
*chars_found = 0;
-
-
while(search_done == 0)
-
{
-
*edge_pos = 0;
-
edge_chars_found = 0;
-
node = trace_single_edge(tree, node, str, edge_pos, &edge_chars_found, type, &search_done);
-
str.begin += edge_chars_found;
-
*chars_found += edge_chars_found;
-
}
-
returnnode;
-
}
-
-
/******************************************************************************/
-
/*
-
ST_FindSubstring :
-
See suffix_tree.h for description.
-
*/
-
-
DBL_WORD ST_FindSubstring(
-
/* The suffix array */
-
SUFFIX_TREE* tree,
-
/* The substring to find */
-
char* W,
-
/* The length of W */
-
DBL_WORD P)
-
{
-
/* Starts with the root's son that has the first character of W as its
-
incoming edge first character */
-
NODE* node = find_son(tree, tree->root, W[0]);
-
DBL_WORD k,j = 0, node_label_en
CDA数据分析师考试相关入口一览(建议收藏):
▷ 想报名CDA认证考试,点击>>>
“CDA报名”
了解CDA考试详情;
▷ 想加入CDA考试题库,点击>>> “CDA题库” 了解CDA考试详情;
▷ 想学习CDA考试教材,点击>>> “CDA教材” 了解CDA考试详情;
▷ 想查询CDA考试成绩,点击>>> “CDA成绩” 了解CDA考试详情;
▷ 想了解CDA考试含金量,点击>>> “CDA含金量” 了解CDA考试详情;
▷ 想获取CDA考试时间/费用/条件/大纲/通过率,点击 >>>“CDA考试官网” 了解CDA考试详情;