
IK Analyzer
1. Add the dependency
2. The IKUtil utility class
3. Custom configuration class

1. Add the dependency

<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012_u6</version>
</dependency>
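
By default, the IKAnalyzer class used in section 2 also picks up an IKAnalyzer.cfg.xml file at the classpath root, where extension and stop-word dictionaries can be registered. A minimal sketch of that file, assuming placeholder dictionary names ext.dic and stopword.dic, looks roughly like this:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer extension configuration</comment>
    <!-- extension dictionaries, classpath-relative, separated by ';' -->
    <entry key="ext_dict">ext.dic;</entry>
    <!-- extension stop-word dictionaries -->
    <entry key="ext_stopwords">stopword.dic;</entry>
</properties>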

2. The IKUtil utility class

import com.asiainfo.biapp.aiop.web.product.config.CustomConfiguration;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;


public class IKUtil {

    private static final Logger logger = LoggerFactory.getLogger(IKUtil.class);

    /**
     * IKAnalyzer is the IK analyzer's default implementation for Lucene. Although the class
     * offers no constructor that takes a custom configuration, it reads IKAnalyzer.cfg.xml
     * from the classpath by default, through which extension and stop-word dictionaries can be added.
     */
    public static String[] separate(String... textArr){
        Set<String> wordSet = new HashSet<>();


        //Create the analyzer once for all inputs; true = smart segmentation, false = finest-grained segmentation
        try (Analyzer anal = new IKAnalyzer(true)) {
            for (String text : textArr) {
                try (TokenStream ts = anal.tokenStream("", new StringReader(text))) {
                    //reset() must be called before incrementToken(), otherwise Lucene throws an exception
                    ts.reset();
                    CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
                    //Collect every token produced for this text
                    while (ts.incrementToken()) {
                        wordSet.add(term.toString());
                    }
                    ts.end();
                } catch (IOException e) {
                    logger.error("Tokenization failed", e);
                }
            }
        }
        String[] arr = new String[wordSet.size()];
        return wordSet.toArray(arr);
    }

    public static String[] parse(String text){
        Set<String> wordSet = new HashSet<>();

        //Drive IKSegmenter directly with the custom configuration singleton (see section 3)
        try (StringReader sr = new StringReader(text)) {
            IKSegmenter ikSegmenter = new IKSegmenter(sr, CustomConfiguration.getInstance());
            Lexeme word;
            //Pull lexemes until the segmenter is exhausted
            while ((word = ikSegmenter.next()) != null) {
                wordSet.add(word.getLexemeText());
            }
        } catch (IOException e) {
            logger.error("Failed to read segmentation results", e);
        }
        String[] arr = new String[wordSet.size()];
        return wordSet.toArray(arr);

    }

    public static void main(String[] args) {
        String text = "我是网管智慧中台的老大";
        String[] arr = IKUtil.parse(text);
        for (String s : arr) {
            System.out.println(s);
        }
    }

}
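
A quick usage sketch (the demo class name is arbitrary) contrasting the two entry points: separate() goes through Lucene's IKAnalyzer and the default classpath configuration, while parse() drives IKSegmenter with the CustomConfiguration singleton defined in section 3:

public class IKUtilDemo {

    public static void main(String[] args) {
        String text = "我是网管智慧中台的老大";

        //Tokenized via Lucene's IKAnalyzer (smart mode, default IKAnalyzer.cfg.xml)
        String[] fromAnalyzer = IKUtil.separate(text);
        System.out.println(String.join("/", fromAnalyzer));

        //Tokenized via IKSegmenter with the custom configuration class
        String[] fromSegmenter = IKUtil.parse(text);
        System.out.println(String.join("/", fromSegmenter));
    }
}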

3. Custom configuration class

import org.wltea.analyzer.cfg.Configuration;

import java.util.ArrayList;
import java.util.List;

/**
 * Custom IK configuration: resolves the main dictionary and stop-word dictionary
 * from the classpath (e.g. files under src/main/resources/words/) and is exposed
 * as a singleton. Each dictionary is a plain UTF-8 text file with one word per line.
 *
 * @author xulong3
 * @date 2020/7/1 17:14
 */
public class CustomConfiguration implements Configuration{

    private static final String PATH_DIC_MAIN = "words/main.dic";
    private static final String PATH_DIC_STOPWORD = "words/stopword.dic";
    private static final String PATH_DIC_QUANTIFIER = "org/wltea/analyzer/dic/quantifier.dic";


    private static CustomConfiguration customConfiguration = new CustomConfiguration();
    private CustomConfiguration(){}

    public static CustomConfiguration getInstance(){
        return customConfiguration;
    }

    private boolean useSmart;

    @Override
    public boolean useSmart() {
        return useSmart;
    }

    @Override
    public void setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
    }

    @Override
    public String getMainDictionary() {
        return PATH_DIC_MAIN;
    }

    @Override
    public String getQuantifierDicionary() {
        return PATH_DIC_QUANTIFIER;
    }

    @Override
    public List<String> getExtDictionarys() {
        //No additional extension dictionaries are configured beyond the main dictionary
        return null;
    }

    @Override
    public List<String> getExtStopWordDictionarys() {
        List<String> pathList = new ArrayList<>();
        pathList.add(PATH_DIC_STOPWORD);
        return pathList;
    }
}
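
Because parse() in IKUtil uses this singleton as-is, useSmart defaults to false, i.e. finest-grained segmentation. A minimal sketch for switching to smart segmentation through this configuration (the demo class name and sample text are arbitrary) would be:

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;
import java.io.StringReader;

public class SmartModeDemo {

    public static void main(String[] args) throws IOException {
        CustomConfiguration cfg = CustomConfiguration.getInstance();
        //Switch the shared configuration to smart (disambiguated, coarser) segmentation
        cfg.setUseSmart(true);

        try (StringReader reader = new StringReader("我是网管智慧中台的老大")) {
            IKSegmenter segmenter = new IKSegmenter(reader, cfg);
            Lexeme lexeme;
            while ((lexeme = segmenter.next()) != null) {
                System.out.println(lexeme.getLexemeText());
            }
        }
    }
}

Note that the configuration is a shared singleton, so flipping useSmart here also affects any later IKUtil.parse() calls.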
