
Lucene tokenization analyzers (Analyzer)

This post compares four of Lucene's built-in analyzers:

SimpleAnalyzer - splits on whitespace and punctuation

StandardAnalyzer - grammar-based "mixed" splitting that also removes stop words and handles Chinese text

WhitespaceAnalyzer - splits on whitespace only

StopAnalyzer - like SimpleAnalyzer, plus stop-word removal

Test code:

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class TestAnalyzer {

    private static String testString1 = "The quick brown fox jumped over the lazy dogs";
    private static String testString2 = "xy&z mail is - xyz@sohu.com";

    // WhitespaceAnalyzer: splits on whitespace only
    public static void testWhitespace(String testString) throws Exception {
        Analyzer analyzer = new WhitespaceAnalyzer();
        Reader r = new StringReader(testString);
        TokenStream ts = analyzer.tokenStream("", r);
        System.err.println("=====Whitespace analyzer====");
        System.err.println("Analysis: split on whitespace");
        Token t;
        while ((t = ts.next()) != null) {
            System.out.println(t.termText());
        }
    }

    // SimpleAnalyzer: splits on whitespace and punctuation, lower-cases tokens
    public static void testSimple(String testString) throws Exception {
        Analyzer analyzer = new SimpleAnalyzer();
        Reader r = new StringReader(testString);
        TokenStream ts = analyzer.tokenStream("", r);
        System.err.println("=====Simple analyzer====");
        System.err.println("Analysis: split on whitespace and punctuation");
        Token t;
        while ((t = ts.next()) != null) {
            System.out.println(t.termText());
        }
    }

    // StopAnalyzer: like SimpleAnalyzer, but also removes stop words
    public static void testStop(String testString) throws Exception {
        Analyzer analyzer = new StopAnalyzer();
        Reader r = new StringReader(testString);
        TokenStream ts = analyzer.tokenStream("", r);
        System.err.println("=====stop analyzer====");
        System.err.println("Analysis: split on whitespace and punctuation, and remove stop words such as is, are, in, on, the");
        Token t;
        while ((t = ts.next()) != null) {
            System.out.println(t.termText());
        }
    }

    // StandardAnalyzer: grammar-based tokenization that removes stop words,
    // keeps constructs such as e-mail addresses intact, and handles Chinese text
    public static void testStandard(String testString) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        Reader r = new StringReader(testString);
        TokenStream ts = analyzer.tokenStream("", r);
        System.err.println("=====standard analyzer====");
        System.err.println("Analysis: mixed splitting, removes stop words, supports Chinese");
        Token t;
        while ((t = ts.next()) != null) {
            System.out.println(t.termText());
        }
    }

    public static void main(String[] args) throws Exception {
//        String testString = testString1;
        String testString = testString2;
        System.out.println(testString);
        testWhitespace(testString);
        testSimple(testString);
        testStop(testString);
        testStandard(testString);
    }
}

Output:

xy&z mail is - xyz@sohu.com
=====Whitespace analyzer====
Analysis: split on whitespace
xy&z
mail
is
-
xyz@sohu.com
=====Simple analyzer====
Analysis: split on whitespace and punctuation
xy
z
mail
is
xyz
sohu
com
=====stop analyzer====
Analysis: split on whitespace and punctuation, and remove stop words such as is, are, in, on, the
xy
z
mail
xyz
sohu
com
=====standard analyzer====
Analysis: mixed splitting, removes stop words, supports Chinese
xy&z
mail
xyz@sohu.com
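The Token.next()/termText() calls used above belong to early Lucene releases and were removed in later versions. As a rough sketch of how the same token dump looks with the attribute-based API of newer Lucene (roughly 4.x onward; the ModernAnalyzerDemo class name is made up for illustration, and constructor and package details vary between releases, so treat this as an assumption rather than the exact API of any one version). Note that the newer StandardAnalyzer follows UAX#29 word-break rules, so its tokens for the e-mail string may differ from the result shown above.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ModernAnalyzerDemo {

    public static void main(String[] args) throws IOException {
        // Assumes a recent release; some older 4.x versions need a Version argument here.
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("", "xy&z mail is - xyz@sohu.com")) {
            // Token data is read through attributes instead of Token objects
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                    // required before the first incrementToken()
            while (ts.incrementToken()) {  // replaces the old while ((t = ts.next()) != null) loop
                System.out.println(term.toString());
            }
            ts.end();
        }
        analyzer.close();
    }
}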