天天看點

Elasticsearch 拼音分詞器

安裝步驟

下載位址:https://github.com/medcl/elasticsearch-analysis-pinyin/releases/download/v7.2.0/elasticsearch-analysis-pinyin-7.2.0.zip

在 plugins 目錄下建立 pinyin 資料夾,上傳壓縮檔並解壓縮(所有節點都要執行)

[root@localhost elasticsearch]# ls
bin  config  data  jdk  lib  LICENSE.txt  logs  modules  NOTICE.txt  plugins  README.textile
[root@localhost elasticsearch]# cd plugins/
[root@localhost plugins]# ls
analysis-ik
[root@localhost plugins]# mkdir pinyin
[root@localhost plugins]# cd pinyin/
[root@localhost pinyin]# ls
elasticsearch-analysis-pinyin-7.2.0.zip
[root@localhost pinyin]# unzip elasticsearch-analysis-pinyin-7.2.0.zip
Archive:  elasticsearch-analysis-pinyin-7.2.0.zip
  inflating: plugin-descriptor.properties
  inflating: elasticsearch-analysis-pinyin-7.2.0.jar
  inflating: nlp-lang-1.7.jar
           

修改檔案擁有者(將 pinyin 目錄的使用者與群組改為執行 Elasticsearch 的帳號)

[root@localhost plugins]# chown -R elastic:elastic ./pinyin/
[root@localhost plugins]# ll
total 0
drwxr-xr-x. 3 elastic elastic 243 Jul 29 15:53 analysis-ik
drwxr-xr-x. 2 elastic elastic 113 Aug  8 17:46 pinyin
           

重新啟動叢集(外掛程式需在所有節點重新啟動 Elasticsearch 後才會生效)

使用方式

測試拼音分詞器:

GET /_analyze
{
  "text":"劉德華",
  "analyzer": "pinyin"
}

{
  "tokens" : [
    {
      "token" : "liu",
      "start_offset" : 0,
      "end_offset" : 0,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "ldh",
      "start_offset" : 0,
      "end_offset" : 0,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "de",
      "start_offset" : 0,
      "end_offset" : 0,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "hua",
      "start_offset" : 0,
      "end_offset" : 0,
      "type" : "word",
      "position" : 2
    }
  ]
}
           

說明:

  • keep_first_letter:保留拼音首字母組合,例如:劉德華 -> ldh
  • keep_separate_first_letter:將每個首字母分開保留,例如:劉德華 -> l,d,h
  • limit_first_letter_length:first_letter 結果的最大長度,預設值:16
  • keep_full_pinyin:保留每個字的完整拼音,例如:劉德華 -> [liu,de,hua]
  • keep_joined_full_pinyin:保留連在一起的完整拼音,例如:劉德華 -> [liudehua]
  • keep_none_chinese:結果中保留非中文字母或數字,預設值:true
  • keep_none_chinese_together:將非中文字母保持在一起,預設值:true,例如:DJ音樂家 -> DJ,yin,yue,jia;設定為 false 時,例如:DJ音樂家 -> D,J,yin,yue,jia。注意:必須先啟用 keep_none_chinese
  • keep_none_chinese_in_first_letter:在首字母結果中保留非中文字母,例如:劉德華AT2016 -> ldhat2016
  • keep_none_chinese_in_joined_full_pinyin:在連在一起的完整拼音中保留非中文字母,例如:劉德華2016 -> liudehua2016
  • lowercase:將非中文字母轉為小寫,預設值:true
  • remove_duplicated_term:移除重複的詞項,例如:de的 -> de

建立使用自訂拼音分詞器的 index:

PUT /express_info_v1/ 
{
    "settings" : {
		"number_of_shards": 3,
		"number_of_replicas": 1,
        "analysis" : {
            "analyzer" : {
                "pinyin_analyzer" : {
                    "tokenizer" : "my_pinyin"
                    }
            },
            "tokenizer" : {
                "my_pinyin" : {
                    "type" : "pinyin",
                    "keep_separate_first_letter" : false,
                    "keep_full_pinyin" : true,
                    "keep_original" : true,
                    "limit_first_letter_length" : 16,
                    "lowercase" : true,
                    "remove_duplicated_term" : true
                }
            }
        }
    }
}
           

為 index 建立別名:

POST _aliases
{
  "actions": [
    {
      "add": {
        "index": "express_info_v1",
        "alias": "express_info"
      }
    }
  ]
}
           

建立mapping

PUT /express_info_v1/_mappings
{
  "properties":{
    "name":{
      "type":"text",
      "analyzer": "pinyin_analyzer"
    },
    "address":{
      "type":"text",
      "analyzer":"pinyin_analyzer"
    },
    "send_time":{
      "type":"date",
      "format": "yyyy-MM-dd"
    },
    "num":{
      "type":"text",
      "analyzer":"pinyin_analyzer"
    }
  }
}
           

填充資料:

PUT /express_info_v1/_doc/1
{
	"name": "薛蔣柳",
  "address": "康莊街道B-11-8",
  "send_time": "2019-08-07",
  "num":"sf9971618841"
}

PUT /express_info_v1/_doc/2
{
	"name": "袁喻",
  "address": "江西省撫州市黎川縣",
  "send_time": "2019-08-08",
  "num":"ve458634059"
}
           

查詢資料:

GET /express_info/_search
{
  "query": {
    "match": {
      "name": "yy"
    }
  }
}

GET /express_info/_search
{
  "query": {
    "match": {
      "name": "源于"
    }
  }
}

GET /express_info/_search
{
  "query": {
    "match": {
      "name": "薛蔣l"
    }
  }
}

GET /express_info/_search
{
  "query": {
    "match": {
      "name": "xuejiangliu"
    }
  }
}
           

重建index

中文分詞+拼音分詞器

PUT /express_info_v2
{
  "settings": {
		"number_of_shards": 3,
		"number_of_replicas": 1,
        "analysis": {
            "analyzer": {
                "ik_smart_pinyin": {
                    "type": "custom",
                    "tokenizer": "ik_smart",
                    "filter": ["my_pinyin", "word_delimiter"]
                },
                "ik_max_word_pinyin": {
                    "type": "custom",
                    "tokenizer": "ik_max_word",
                    "filter": ["my_pinyin", "word_delimiter"]
                }
            },
            "filter": {
                "my_pinyin": {
                    "type" : "pinyin",
                    "keep_separate_first_letter" : false,
                    "keep_full_pinyin" : true,
                    "keep_original" : true,
                    "limit_first_letter_length" : 16,
                    "lowercase" : true,
                    "remove_duplicated_term" : true 
                }
            }
        }
  }
}
           

建立mapping

PUT /express_info_v2/_mappings
{
  "properties":{
    "name":{
      "type":"text",
      "analyzer": "ik_smart_pinyin"
    },
    "address":{
      "type":"text",
      "analyzer":"ik_smart_pinyin"
    },
    "send_time":{
      "type":"date",
      "format": "yyyy-MM-dd"
    },
    "num":{
      "type":"text",
      "analyzer":"ik_max_word_pinyin"
    }
  }
}
           

資料重載:

POST _reindex
{
  "source": {
    "index": "express_info_v1"
  }, 
  "dest": {
    "index": "express_info_v2"
  }
}
           

使用新index取代原始的index

POST /_aliases
{
  "actions": [
    {
      "remove": {
        "index": "express_info_v1",
        "alias": "express_info"
      }
    },
    {
       "add": {
        "index": "express_info_v2",
        "alias": "express_info"
      }
    }
  ]
}
           

刪除原始的 index:

DELETE express_info_v1
           

測試:

GET /express_info_v2/_analyze
{
  "text": "江西省撫州市黎川縣"
  , "analyzer": "ik_max_word_pinyin"
}
{
  "tokens" : [
    {
      "token" : "jiang",
      "start_offset" : 0,
      "end_offset" : 3,
      "type" : "CN_WORD",
      "position" : 0
    },
    {
      "token" : "江西省",
      "start_offset" : 0,
      "end_offset" : 3,
      "type" : "CN_WORD",
      "position" : 0
    },
    {
      "token" : "jxs",
      "start_offset" : 0,
      "end_offset" : 3,
      "type" : "CN_WORD",
      "position" : 0
    },
    {
      "token" : "xi",
      "start_offset" : 0,
      "end_offset" : 3,
      "type" : "CN_WORD",
      "position" : 1
    },
    {
      "token" : "sheng",
      "start_offset" : 0,
      "end_offset" : 3,
      "type" : "CN_WORD",
      "position" : 2
    },
    {
      "token" : "fu",
      "start_offset" : 3,
      "end_offset" : 6,
      "type" : "CN_WORD",
      "position" : 3
    },
    {
      "token" : "zhou",
      "start_offset" : 3,
      "end_offset" : 6,
      "type" : "CN_WORD",
      "position" : 4
    },
    {
      "token" : "shi",
      "start_offset" : 3,
      "end_offset" : 6,
      "type" : "CN_WORD",
      "position" : 5
    },
    {
      "token" : "撫州市",
      "start_offset" : 3,
      "end_offset" : 6,
      "type" : "CN_WORD",
      "position" : 5
    },
    {
      "token" : "fzs",
      "start_offset" : 3,
      "end_offset" : 6,
      "type" : "CN_WORD",
      "position" : 5
    },
    {
      "token" : "li",
      "start_offset" : 6,
      "end_offset" : 9,
      "type" : "CN_WORD",
      "position" : 6
    },
    {
      "token" : "chuan",
      "start_offset" : 6,
      "end_offset" : 9,
      "type" : "CN_WORD",
      "position" : 7
    },
    {
      "token" : "xian",
      "start_offset" : 6,
      "end_offset" : 9,
      "type" : "CN_WORD",
      "position" : 8
    },
    {
      "token" : "黎川縣",
      "start_offset" : 6,
      "end_offset" : 9,
      "type" : "CN_WORD",
      "position" : 8
    },
    {
      "token" : "lcx",
      "start_offset" : 6,
      "end_offset" : 9,
      "type" : "CN_WORD",
      "position" : 8
    }
  ]
}
           

繼續閱讀