天天看點

字尾數組的自頂向下(top-down)周遊算法

類似于"字尾數組的自底向上(bottom-up)周遊算法",也可以對Enhanced Suffix Array執行自頂向下(top-down)周遊。這裡通過增強資訊--lcptable和childtab--來自頂向下周遊字尾數組。每一個字尾的childtab元素包含up,down和nexLIndex三項。up或down值指向某個lcp-interval的第一個l-index;nextLIndex指向下一個l-index,這樣所有的孩子區間都能在childtab的基礎上得出:設某個lcp-interval[i..j]的l-indices為i1<i2<...<ik,那麼第一個孩子區間為[i..i1-1],其他的孩子區間為[i1..i2-1], ... [ik...j];并且childtab[i].down和childtab[j+1].up兩個值之間至少有一個值等于第一個l-index,其他的l-index可以通過childtab[i1].nextLIndex得出。計算childtab的算法參考“Replacing suffix tree with enhanced suffix arrays”。

實作:

import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
 
/**
 * 
 * Top-down Traversal of a suffix array (with lcptable and childtab)
 * (The suffix array is constructed with DC3 algorithm)
 * 
 *  
 * Copyright (c) 2011 ljs (http://blog.csdn.net/ljsspace/)
 * Licensed under GPL (http://www.opensource.org/licenses/gpl-license.php) 
 * 
 * @author ljs
 * 2011-07-25
 *
 */ 
public class ESA_TopDownTraversal {
	public class LIndexInfo {
		private int up=-1;
		private int down=-1;
		private int nextLIndex = -1;
		public String toString(){
			return up + ":" + down +  ":" + nextLIndex;
		}
	}
	private String text;
	private int[] lcptable;
	private LIndexInfo[] childtab;
	private int len;
	
	public ESA_TopDownTraversal(String text){
		this.text = text;
		this.len = text.length();
		
		childtab=new LIndexInfo[len];
		
		for(int i=0;i<len;i++){
			childtab[i] = new LIndexInfo();
		}
	}
	
	public static final char MAX_CHAR = '\u00FF';

	class Suffix{
		int[] sa;  
		//Note: the p-th suffix in sa: SA[rank[p]-1]];
		//p is the index of the array "rank", start with 0;
		//a text S's p-th suffix is S[p..n], n=S.length-1.
		int[] rank; 
		boolean done;
		 
		public Suffix(int[] sa,int[] rank){
			this.sa = sa;
			this.rank = rank;
		}
	}
	

	//a prefix of suffix[isuffix] represented with digits
	class Tuple{
		int isuffix; //the p-th suffix
		int[] digits;
		public Tuple(int suffix,int[] digits){
			this.isuffix = suffix;
			this.digits = digits;			
		}
		public String toString(){
			StringBuffer sb = new StringBuffer();			
			sb.append(isuffix);
			sb.append("(");
			for(int i=0;i<digits.length;i++){
				sb.append(digits[i]);
				if(i<digits.length-1)
					sb.append("-");
			}
			sb.append(")");
			return sb.toString();
		}
	}
	
	//d: the digit to do countingsort
	//max: A value's range is 0...max
	private void countingSort(int d,Tuple[] tA,Tuple[] tB,int max){
		//init the counter array
		int[] C = new int[max+1];
		for(int i=0;i<=max;i++){
			C[i] = 0;
		}
		//stat the count
		for(int j=0;j<tA.length;j++){
			C[tA[j].digits[d]]++;
		}
		//process the counter array C
		for(int i=1;i<=max;i++){
			C[i]+=C[i-1];
		}
		//distribute the values  
		for(int j=tA.length-1;j>=0;j--){
			//C[A[j]] <= A.length 
			tB[--C[tA[j].digits[d]]]=tA[j];			
		}
	}
	
	//tA: input
	//tB: output for rank caculation
	private void radixSort(Tuple[] tA,Tuple[] tB,int max,int digitsLen){
		int len = tA.length;
		int digitsTotalLen = tA[0].digits.length;
			
		for(int d=digitsTotalLen-1,j=0;j<digitsLen;d--,j++){
			this.countingSort(d, tA, tB, max);
			//assign tB to tA
			if(j<digitsLen-1){
				for(int i=0;i<len;i++){
					tA[i] = tB[i];
				}		
			}
		}
	}
	
	//max is the maximum value in any digit of TA.digits[], used for counting sort
	//tA: input
	//tB: the place holder, reused between iterations
	private Suffix rank(Tuple[] tA,Tuple[] tB,int max,int digitsLen){		
		int len = tA.length;		
		radixSort(tA,tB,max,digitsLen);	
		
		int digitsTotalLen = tA[0].digits.length;
		
		//caculate rank and sa	
		int[] sa = new int[len];
		sa[0] = tB[0].isuffix;	
		
		int[] rank = new int[len+2]; //add 2 for sentinel	
		rank[len]=1;rank[len+1] = 1;
		int r = 1; //rank starts with 1
		rank[tB[0].isuffix] = r;		
		for(int i=1;i<len;i++){
			sa[i] = tB[i].isuffix;	
			
			boolean equalLast = true;
			for(int j=digitsTotalLen-digitsLen;j<digitsTotalLen;j++){
				if(tB[i].digits[j]!=tB[i-1].digits[j]){
					equalLast = false;
					break;
				}
			}
			if(!equalLast){
				r++;
			}
			rank[tB[i].isuffix] = r;	
		}
				 
		Suffix suffix = new Suffix(sa,rank);
		//judge if we are done
		if(r==len){
			suffix.done = true;
		}else{
			suffix.done = false;
		}
		return suffix;
		
	}
	
	private int[] orderSuffixes(Tuple[] tA,Tuple[] tB,int max,int digitsLen){		
		int len = tA.length;		
		radixSort(tA,tB,max,digitsLen);			
		//caculate rank and sa	
		int[] sa = new int[len];
		for(int i=0;i<len;i++){
			sa[i] = tB[i].isuffix;				
		}
		return sa;		 
	}
	
	//rank needs sentinel: len+2
	private Suffix reduce(int[] rank,int max){
		int len = rank.length - 2;
		
		int n1 = (len+1)/3;
		int n2 = len/3;
		Tuple[] tA = new Tuple[n1+n2];
		Tuple[] tB = new Tuple[n1+n2];
		
		for(int i=0,j=1;i<n1;i++,j+=3){
			int r1 =  rank[j];
			int r2 =  rank[j+1];
			int r3 =  rank[j+2];
			tA[i] = new Tuple(i,new int[]{r1,r2,r3});
		}
		for(int i=n1,j=2;i<n1+n2;i++,j+=3){
			int r1 =  rank[j];
			int r2 =  rank[j+1];
			int r3 =  rank[j+2];	 
			tA[i] = new Tuple(i,new int[]{r1,r2,r3});
		}
		 
		return rank(tA,tB,max,3);		
	}
	
	
	private int[] skew(int[] rank,int max){
		int len = rank.length - 2;
		
		//step 1: caculate sa12
		Suffix suffixT12 = reduce(rank,max);
		 
		
		int[] sa12 = null;
		if(!suffixT12.done){
			int[] rankT12 = suffixT12.rank;
			int maxT12 = rankT12[suffixT12.sa[suffixT12.sa.length-1]];
			sa12 = skew(rankT12,maxT12);
			// debug for string: GACCCACCACC#
			//s12 = new Suffix();
			//s12.rank = new int[]{3,6,5,4,7,2,1,1,1};
			//s12.sa = new int[]{7,6,5,0,3,2,1,4};
			//s12.done =true;						
		}else{
			sa12 = suffixT12.sa;			
		}
		
		//index conversion for sa12
		int n1 = (len+1)/3;
		for(int j=0;j<sa12.length;j++){
			if(sa12[j]<n1){
				sa12[j] = 1 + 3*sa12[j];
			}else{
				sa12[j] = 2 + 3*(sa12[j]-n1);
			}				
		}
		//recaculate rank for sa12
		int[] rank12 = new int[len+2];
		rank12[len] = 1;rank12[len+1] = 1;
		for(int k=0;k<sa12.length;k++){
			rank12[sa12[k]] = k+1;
		}
		 
		  
		
		//step 2: caculate sa0		
		int n0=(len+2)/3;
		Tuple[] tA = new Tuple[n0];
		Tuple[] tB = new Tuple[n0];
		for(int i=0,j=0;i<n0;i++,j+=3){
			int r1 =  rank[j];
			int r2 =  rank12[j+1]; 
			tA[i] = new Tuple(i,new int[]{r1,r2});
		}
		int max12 = rank12[sa12[sa12.length-1]];		
		int[] sa0 = orderSuffixes(tA,tB,max<max12?max12:max,2);
		//index conversion for sa0
		for(int j=0;j<n0;j++){
			sa0[j] = 3*sa0[j];					
		}		 
		
		//step 3: merge sa12 and sa0
		int[] sa = new int[len];
		int i=0,j=0;
		int k=0;
		while(i<sa12.length && j<sa0.length){
			int p = sa12[i];
			int q = sa0[j];
			if(p%3==1){
				//case 1
				if(rank[p]<rank[q]){
					sa[k++] = p;i++;
				}else if(rank[p]>rank[q]){
					sa[k++] = q;j++;
				}else{
					if(rank12[p+1]<rank12[q+1]){
						sa[k++] = p;i++;
					}else{
						sa[k++] = q;j++;
					}					
				}
			}else{
				//case 2
				if(rank[p]<rank[q]){
					sa[k++] = p;i++;
				}else if(rank[p]>rank[q]){
					sa[k++] = q;j++;
				}else{
					if(rank[p+1]<rank[q+1]){
						sa[k++] = p;i++;
					}else if(rank[p+1]>rank[q+1]){
						sa[k++] = q;j++;
					}else{
						if(rank12[p+2]<rank12[q+2]){
							sa[k++] = p;i++;
						}else{
							sa[k++] = q;j++;
						}			
					}
				}
			}			
		}
		for(int m=i;m<sa12.length;m++){
			sa[k++] = sa12[m];
		}
		for(int m=j;m<sa0.length;m++){
			sa[k++] = sa0[m];
		}		
		
		return sa;		
	}
	//Precondition: the last char in text must be less than other chars.
	private Suffix DC3(String text){
		if(text == null)return null;
		int len = text.length();
		if(len == 0) return null;
		
		char base = text.charAt(len-1); //the smallest char
		Tuple[] tA = new Tuple[len];
		Tuple[] tB = new Tuple[len]; //placeholder
		for(int i=0;i<len;i++){
			tA[i] = new Tuple(i,new int[]{0,text.charAt(i)-base});
		}
		Suffix suffix = rank(tA,tB,MAX_CHAR-base,1);
		 
		int max = suffix.rank[suffix.sa[len-1]];
		int[] sa  = skew(suffix.rank,max);
		
		//caculate rank for result suffix array
		int[] r = new int[len];		
		for(int k=0;k<sa.length;k++){
			r[sa[k]] = k+1;
		}
		return new Suffix(sa,r);
		
	}
	
	
	public void buildChildtab(){
		//step 1: caculate up and down value
		Stack<Integer> stack = new Stack<Integer>();
		int lastIndex = -1;
		stack.push(0);
		for(int i=1;i<len;i++){ 
			while(lcptable[i]<lcptable[stack.peek()]){
				lastIndex = stack.pop();
				int next = stack.peek();
				if(lcptable[i]<=lcptable[next] 
				        && lcptable[next] != lcptable[lastIndex]){
					childtab[next].down = lastIndex; 
				}
			}
			if(lastIndex != -1){
				childtab[i].up = lastIndex;
				lastIndex = -1;
			}
			stack.push(i);
		}
		//process remaining elements
		while(0<lcptable[stack.peek()]){
			lastIndex = stack.pop();
			int next = stack.peek();
			if(0<=lcptable[next] 
			        && lcptable[next] != lcptable[lastIndex]){
				childtab[next].down = lastIndex; 
			}
		}
		
		
		//step 2: caculate nextLIndex
		stack.clear();
		stack.push(0);
		for(int i=1;i<len;i++){ 
			while(lcptable[i]<lcptable[stack.peek()]){
				stack.pop();				
			}
			if(lcptable[i] == lcptable[stack.peek()]){				
				lastIndex = stack.pop();
				childtab[lastIndex].nextLIndex = i;
			}
			stack.push(i);
		}		
	
		//System.out.println();
	}
	
	class LCPInterval{
		int lcp; //the lcp value of the lcp-interval
		int lb; //the left boundary suffix index
		int rb; //the right boundary suffix index
		List<LCPInterval> children;
		public LCPInterval(int lcp,int lb,int rb){
			this.lcp = lcp;
			this.lb = lb;
			this.rb = rb;
		}
		public String toString(){
			return String.format("%d-[%d..%d]", 
					this.lcp,this.lb,this.rb);
		}
	}	
	 
	//Note: (i,j) != (0,n-1)
	private List<LCPInterval> getChildIntervals(int i,int j){		
		List<LCPInterval> children = new ArrayList<LCPInterval>();
		 		
		int up = -1;
		if(j+1<len){
			up = childtab[j+1].up;
		}
		int i1 = -1;
		if(up>i && up<=j){
			i1 = up;
		}else{
			i1 = childtab[i].down;
		}		
		if(i<i1-1){
			int lcp = getlcp(i,i1-1);
			children.add(new LCPInterval(lcp,i,i1-1));
		}
		
		while(childtab[i1].nextLIndex != -1){
			int i2 = childtab[i1].nextLIndex;			
			if(i1<i2-1){
				int lcp = getlcp(i1,i2-1);
				children.add(new LCPInterval(lcp,i1,i2-1));	
			}
			i1 = i2;
		}
		if(i1<j){
			int lcp = getlcp(i1,j);
			children.add(new LCPInterval(lcp,i1,j));
		}
		return children;
	}
	//Note: (i,j) != (0,n-1)
	private int getlcp(int i,int j){ 
		if(i==j) return lcptable[i];
		
		int up = -1;
		if(j+1<len){
			up = childtab[j+1].up;
		}

		if(up>i && up<=j){
			return lcptable[up];
		}else{
			return lcptable[childtab[i].down];
		}
	}
	private void topDownTraverse(LCPInterval interval){		
		List<LCPInterval> childIntervals = getChildIntervals(interval.lb,interval.rb);
		interval.children = childIntervals;
		reportLCPInterval(interval);
		for(LCPInterval child:childIntervals){
			topDownTraverse(child);
		}
	}
	public void topDownTraverse(int[] lcptable){
		this.lcptable = lcptable;
		
		this.buildChildtab();
		
		
		LCPInterval root = new LCPInterval(0,0,len-1);
		
		
		List<LCPInterval> childIntervals = new ArrayList<LCPInterval>();
		
		int i1 = this.childtab[0].nextLIndex;
		if(i1-1>0){
			int lcp = getlcp(0,i1-1);
			childIntervals.add(new LCPInterval(lcp,0,i1-1));
		}
		while(childtab[i1].nextLIndex != -1){
			int i2 = childtab[i1].nextLIndex;	
			if(i1<i2-1){
				int lcp = getlcp(i1,i2-1);
				childIntervals.add(new LCPInterval(lcp,i1,i2-1));	
			}
			i1 = i2;
		}
		if(i1<len-1){
			int lcp = getlcp(i1,len-1);
			childIntervals.add(new LCPInterval(lcp,i1,len-1));
		}
		
		root.children = childIntervals;		
		reportLCPInterval(root);
		
		for(LCPInterval child:childIntervals){
			topDownTraverse(child);
		}
	}
	private void reportLCPInterval(LCPInterval interval){
		if(interval.children.size()>0){
			StringBuilder sb = new StringBuilder();
			for(LCPInterval child:interval.children){
				sb.append(child.toString());
				sb.append(",");
			}
			sb.deleteCharAt(sb.length()-1);
			System.out.format("%s, children={%s}%n", 
					interval,sb.toString());				
		}else{
			System.out.format("%s%n", interval);	
		}	
	}
	//rank[p]'s index starts with 1 (not 0)
	public int[] computeLCPTable(String text,int[] sa,int[] rank){
		if(text == null)return null;
		int len = text.length();
		if(len == 0) return null;
		 
		int[] lcpz = new int[len];
		
		//base case: p=0
		//caculate LCP of suffix[0]
		int lcp = 0;
		int r = rank[0]-1;
		if(r>0){
		   int q=sa[r-1];
		   //caculate LCP by definition
		   for(int i=0,j=q;i<len && j<len;i++,j++){
			   if(text.charAt(i) != text.charAt(j)){
				   lcp=i;
				   break;
			   }
		   }
		}
		lcpz[0] = lcp;
		
		//other cases: p>=1
		//ignore p == sa[0] because LCP=0 for suffix[p] where rank[p]=0				
		for(int p=1;p<len && p != sa[0];p++){
			int h = lcpz[p-1];
			int q=sa[rank[p]-2];
			lcp = 0;
			if(h>1){ //for h<=1, caculate LCP by definition (i.e. start with lcp=0)			
				//jump h-1 chars for suffix[p] and suffix[q]						
				lcp = h-1;			    
			}
			for(int i=p+lcp,j=q+lcp,k=0;i<len && j<len;i++,j++,k++){
			   if(text.charAt(i) != text.charAt(j)){
				   lcp+=k;
				   break;
			   }
			}
			lcpz[p] = lcp;
		}
		
		//caculate LCP
		int[] LCP = new int[len];
		for(int i=0;i<len;i++){
			LCP[i] = lcpz[sa[i]];
		}
		return LCP;
	}
 

	public void solve(){
		Suffix suffix = this.DC3(this.text);
		int[] sa = suffix.sa;
		int[] rank = suffix.rank;		
		
		int[] lcptable = this.computeLCPTable(this.text,sa,rank);
	 
		
		this.topDownTraverse(lcptable);		
		
	}
	
	public static void main(String[] args) { 
		//int[] lcptable = {0,2,1,3,1,2,0,2,0,1,0};
		//int[] lcptable = {0,0,3,3,2,2,1,1,0,1};
		String text = "mississippi#";
		System.out.format("Internal Nodes for text: %s %n%n",text);
		ESA_TopDownTraversal esa = new ESA_TopDownTraversal(text);		
		esa.solve();
						
		System.out.format("%n********************************%n");
		text = "GACCCACCACC#";
		System.out.format("Internal Nodes for text: %s %n%n",text);
		esa = new ESA_TopDownTraversal(text);		
		esa.solve();
		
		System.out.format("%n********************************%n");		
		text = "abcdefghijklmmnopqrstuvwxyz#";
		System.out.format("Internal Nodes for text: %s %n%n",text);
		esa = new ESA_TopDownTraversal(text);		
		esa.solve();
		
		System.out.format("%n********************************%n");		
		text = "yabbadabbado#";
		System.out.format("Internal Nodes for text: %s %n%n",text);
		esa = new ESA_TopDownTraversal(text);		
		esa.solve();
		
		System.out.format("%n********************************%n");		
		text = "AAAAAAAAAAAAAAAAAAAAAAAAAA#";
		System.out.format("Internal Nodes for text: %s %n%n",text);
		esa = new ESA_TopDownTraversal(text);		
		esa.solve();
		
		System.out.format("%n********************************%n");		
		text = "GGGGGGGGGGGGCGCAAAAGCGAGCAGAGAGAAAAAAAAAAAAAAAAAAAAAA#";
		System.out.format("Internal Nodes for text: %s %n%n",text);
		esa = new ESA_TopDownTraversal(text);		
		esa.solve();
	}

}
           

測試:

Internal Nodes for text: mississippi#

0-[0..11], children={1-[1..4],1-[6..7],1-[8..11]}

1-[1..4], children={4-[3..4]}

4-[3..4]

1-[6..7]

1-[8..11], children={2-[8..9],3-[10..11]}

2-[8..9]

3-[10..11]

********************************

Internal Nodes for text: GACCCACCACC#

0-[0..11], children={3-[1..3],1-[4..10]}

3-[1..3]

1-[4..10], children={4-[5..6],2-[7..10]}

4-[5..6]

2-[7..10], children={5-[8..9]}

5-[8..9]

********************************

Internal Nodes for text: abcdefghijklmmnopqrstuvwxyz#

0-[0..27], children={1-[13..14]}

1-[13..14]

********************************

Internal Nodes for text: yabbadabbado#

0-[0..12], children={1-[1..4],1-[5..8],1-[9..10]}

1-[1..4], children={5-[1..2],2-[3..4]}

5-[1..2]

2-[3..4]

1-[5..8], children={3-[5..6],4-[7..8]}

3-[5..6]

4-[7..8]

1-[9..10]

********************************

Internal Nodes for text: AAAAAAAAAAAAAAAAAAAAAAAAAA#

0-[0..26], children={1-[1..26]}

1-[1..26], children={2-[2..26]}

2-[2..26], children={3-[3..26]}

3-[3..26], children={4-[4..26]}

4-[4..26], children={5-[5..26]}

5-[5..26], children={6-[6..26]}

6-[6..26], children={7-[7..26]}

7-[7..26], children={8-[8..26]}

8-[8..26], children={9-[9..26]}

9-[9..26], children={10-[10..26]}

10-[10..26], children={11-[11..26]}

11-[11..26], children={12-[12..26]}

12-[12..26], children={13-[13..26]}

13-[13..26], children={14-[14..26]}

14-[14..26], children={15-[15..26]}

15-[15..26], children={16-[16..26]}

16-[16..26], children={17-[17..26]}

17-[17..26], children={18-[18..26]}

18-[18..26], children={19-[19..26]}

19-[19..26], children={20-[20..26]}

20-[20..26], children={21-[21..26]}

21-[21..26], children={22-[22..26]}

22-[22..26], children={23-[23..26]}

23-[23..26], children={24-[24..26]}

24-[24..26], children={25-[25..26]}

25-[25..26]

********************************

Internal Nodes for text: GGGGGGGGGGGGCGCAAAAGCGAGCAGAGAGAAAAAAAAAAAAAAAAAAAAAA#

0-[0..53], children={1-[1..30],1-[31..34],1-[35..53]}

1-[1..30], children={2-[2..25],2-[26..30]}

2-[2..25], children={3-[3..24]}

3-[3..24], children={4-[4..23]}

4-[4..23], children={5-[5..22]}

5-[5..22], children={6-[6..22]}

6-[6..22], children={7-[7..22]}

7-[7..22], children={8-[8..22]}

8-[8..22], children={9-[9..22]}

9-[9..22], children={10-[10..22]}

10-[10..22], children={11-[11..22]}

11-[11..22], children={12-[12..22]}

12-[12..22], children={13-[13..22]}

13-[13..22], children={14-[14..22]}

14-[14..22], children={15-[15..22]}

15-[15..22], children={16-[16..22]}

16-[16..22], children={17-[17..22]}

17-[17..22], children={18-[18..22]}

18-[18..22], children={19-[19..22]}

19-[19..22], children={20-[20..22]}

20-[20..22], children={21-[21..22]}

21-[21..22]

2-[26..30], children={3-[26..28],3-[29..30]}

3-[26..28], children={5-[27..28]}

5-[27..28]

3-[29..30]

1-[31..34], children={2-[31..32],2-[33..34]}

2-[31..32]

2-[33..34]

1-[35..53], children={2-[35..38],2-[39..42],2-[43..53]}

2-[35..38], children={3-[36..38]}

3-[36..38], children={4-[36..37]}

4-[36..37]

2-[39..42], children={3-[39..40],3-[41..42]}

3-[39..40]

3-[41..42]

2-[43..53], children={3-[44..53]}

3-[44..53], children={4-[45..53]}

4-[45..53], children={5-[46..53]}

5-[46..53], children={6-[47..53]}

6-[47..53], children={7-[48..53]}

7-[48..53], children={8-[49..53]}

8-[49..53], children={9-[50..53]}

9-[50..53], children={10-[51..53]}

10-[51..53], children={11-[52..53]}

11-[52..53]

繼續閱讀