天天看點

倆個模式比對算法(BMH and shift-Or)

BMH描叙:

The Boyer-Moore-Horspool (BMH) algorithm [Horspool 1980] is a widely-known

search algorithm for a single pattern. The preprocessing phase of the algorithm

consists of calculating the bad character function B(x). It is defined as the distance

from the end of the pattern p1p2 · · · pm to the last occurrence of the character x:

B(x) = min{h | pm−h = x, h > 0}.

If the character x does not appear in the pattern B(x) = m.

In the matching phase the text is processed in windows of length m. First the

last character in the window is compared with the last character of the pattern.

If they match the whole window is compared against the pattern to check for a

match. After that or if the last characters did not match, the window is shifted by

B(x) where x is the last character of the window 

shift-or描叙:

The shift-or algorithm [Baeza-Yates and Gonnet 1992] is a simple bit-parallel algo-

rithm. In the preprocessing phase a bit vector B[x] is initialized for each character

x of the alphabet. The bit in position i is set to 0 in the bit vector if the i:th

character in the pattern is x. Otherwise the bits are set to one.

In the beginning of the matching phase the state vector E is initialized to 1m.

Then the text is read one character at a time and the state vector is updated as

follows:

E = (E << 1)|B[x]

where x is the character read, << moves the bits to the left and | is the bitwise or

operator. If the m:th bit is zero after this update we have found a match.

BMH實作:

#ifndef _CSINGLE_BMH_H

#define  _CSINGLE_BMH_H

#define  SHIFT_SIZE  256

// the single string match algorithm

class  CSingle_BMH

{

private :

     char  shift_table[SHIFT_SIZE];

     int  pattern_len;

     char *  pattern;

public :

    CSingle_BMH( const   char *  pat)

    {

        pattern_len  =  strlen(pat);

        pattern  =   new   char [pattern_len];

        strcpy(pattern,pat);

        memset(shift_table, pattern_len, SHIFT_SIZE * sizeof ( char ));

         for ( int  i  =   0 ; i  <  pattern_len  0 ; i  ++ )

        {

            shift_table[ * (pattern + i)]  =  pattern_len  -  i;

        }

    }

     // if match success, return the location of the first matching

     // else return -1

     int  match_BMH( char *  str_match,  int  str_len);

     ~ CSingle_BMH()

    {

        delete[] pattern;

    }

};

#endif

int  CSingle_BMH::match_BMH( char *  str_match,  int  str_len)

{

     char   * window_end    =  str_match  +  pattern_len  -   1 ;

     char   * pat  =  pattern  +  pattern_len  -   1 ;

     while ((window_end  -  str_match)  <  str_len)

    {

         for ( int  i  =   0 ; i  <  pattern_len; i  ++ )

        {

             if ( * (window_end  -  i)  !=   * (pat  -  i)) 

            {

                window_end  +=  shift_table[ * (pat - i)];

                 break ;

            }

        }

         if ( i  ==  pattern_len)

        {

             return  window_end  -  str_match  +   1 ;

        }

    }

     return   - 1 ;

}

shift-or實作:

#ifndef _CSINGLE_SHIFT_OR_H

#define  _CSINGLE_SHIFT_OR_H

#define  ALPHABET_SIZE 256

// this is the single string match algorithm

class  CSingle_shift_Or

{

private :

    unsigned  int  bit_vector[ALPHABET_SIZE];

    unsigned  int  state;              // the state of match ,because the state is 32_bits ,so the longest pattern is 32 bytes

     int  pattern_len;        // pattern_len == strlen  

public :

    CSingle_shift_Or( const   char *  pat)

    {

        memset(bit_vector, 0xFFFFFFFF ,ALPHABET_SIZE * sizeof (unsigned  int ));

        state  =   0xFFFFFFFF ;

        pattern_len  =  strlen(pat);

         for ( int  i  =   0 , j  =   1 ; i  <  pattern_len; i  ++ , j  <<=   1 )

        {

            bit_vector[ * (pat + i)]  &=   ~ j ;

        }

    }

     int  match_shift_or( char *  str_match,  int  str_len);

};

#endif

int  CSingle_shift_Or::match_shift_or( char *  str_match,  int  str_len)

{

     int  shift_bit  =  pattern_len  -   1 ;

     for ( int  i  =   0 ; i  <  str_len; i  ++ )

    {

        state  =  (state  <<   1 )  |  bit_vector[ * (str_match  +  i)];

         if ((state  &  ( 0x1   <<  shift_bit))  ==   0 )  

        {

             return  i;    // if the pattern_len bit is zero ,match successfully

        }

    }

     return   - 1 ;

}

繼續閱讀