天天看點

五分鐘帶你玩轉Elasticsearch(八)ik分詞器吐血總結

反向索引

五分鐘帶你玩轉Elasticsearch(八)ik分詞器吐血總結

這裡就涉及到了分詞

分詞文法

預設的分詞器

GET _analyze?pretty
  {
    "text": "Haier/海爾 BCD-470WDPG十字對開門風冷變頻一級節能家用官方冰箱"
  }      
{
  "tokens" : [
    {
      "token" : "haier",
      "start_offset" : 0,
      "end_offset" : 5,
      "type" : "<ALPHANUM>",
      "position" : 0
    },
    {
      "token" : "海",
      "start_offset" : 6,
      "end_offset" : 7,
      "type" : "<IDEOGRAPHIC>",
      "position" : 1
    },
    {
      "token" : "爾",
      "start_offset" : 7,
      "end_offset" : 8,
      "type" : "<IDEOGRAPHIC>",
      "position" : 2
    },
    {
      "token" : "bcd",
      "start_offset" : 9,
      "end_offset" : 12,
      "type" : "<ALPHANUM>",
      "position" : 3
    },
    {
      "token" : "470wdpg",
      "start_offset" : 13,
      "end_offset" : 20,
      "type" : "<ALPHANUM>",
      "position" : 4
    },
    {
      "token" : "十",
      "start_offset" : 20,
      "end_offset" : 21,
      "type" : "<IDEOGRAPHIC>",
      "position" : 5
    },
    {
      "token" : "字",
      "start_offset" : 21,
      "end_offset" : 22,
      "type" : "<IDEOGRAPHIC>",
      "position" : 6
    },
    {
      "token" : "對",
      "start_offset" : 22,
      "end_offset" : 23,
      "type" : "<IDEOGRAPHIC>",
      "position" : 7
    },
    {
      "token" : "開",
      "start_offset" : 23,
      "end_offset" : 24,
      "type" : "<IDEOGRAPHIC>",
      "position" : 8
    },
    {
      "token" : "門",
      "start_offset" : 24,
      "end_offset" : 25,
      "type" : "<IDEOGRAPHIC>",
      "position" : 9
    },
    {
      "token" : "風",
      "start_offset" : 25,
      "end_offset" : 26,
      "type" : "<IDEOGRAPHIC>",
      "position" : 10
    },
    {
      "token" : "冷",
      "start_offset" : 26,
      "end_offset" : 27,
      "type" : "<IDEOGRAPHIC>",
      "position" : 11
    },
    {
      "token" : "變",
      "start_offset" : 27,
      "end_offset" : 28,
      "type" : "<IDEOGRAPHIC>",
      "position" : 12
    },
    {
      "token" : "頻",
      "start_offset" : 28,
      "end_offset" : 29,
      "type" : "<IDEOGRAPHIC>",
      "position" : 13
    },
    {
      "token" : "一",
      "start_offset" : 29,
      "end_offset" : 30,
      "type" : "<IDEOGRAPHIC>",
      "position" : 14
    },
    {
      "token" : "級",
      "start_offset" : 30,
      "end_offset" : 31,
      "type" : "<IDEOGRAPHIC>",
      "position" : 15
    },
    {
      "token" : "節",
      "start_offset" : 31,
      "end_offset" : 32,
      "type" : "<IDEOGRAPHIC>",
      "position" : 16
    },
    {
      "token" : "能",
      "start_offset" : 32,
      "end_offset" : 33,
      "type" : "<IDEOGRAPHIC>",
      "position" : 17
    },
    {
      "token" : "家",
      "start_offset" : 33,
      "end_offset" : 34,
      "type" : "<IDEOGRAPHIC>",
      "position" : 18
    },
    {
      "token" : "用",
      "start_offset" : 34,
      "end_offset" : 35,
      "type" : "<IDEOGRAPHIC>",
      "position" : 19
    },
    {
      "token" : "官",
      "start_offset" : 35,
      "end_offset" : 36,
      "type" : "<IDEOGRAPHIC>",
      "position" : 20
    },
    {
      "token" : "方",
      "start_offset" : 36,
      "end_offset" : 37,
      "type" : "<IDEOGRAPHIC>",
      "position" : 21
    },
    {
      "token" : "冰",
      "start_offset" : 37,
      "end_offset" : 38,
      "type" : "<IDEOGRAPHIC>",
      "position" : 22
    },
    {
      "token" : "箱",
      "start_offset" : 38,
      "end_offset" : 39,
      "type" : "<IDEOGRAPHIC>",
      "position" : 23
    }
  ]
}      

ik_max_word

GET _analyze?pretty
  {
    "analyzer": "ik_max_word",
    "text": "Haier/海爾 BCD-470WDPG十字對開門風冷變頻一級節能家用官方冰箱"
  }      
{
  "tokens" : [
    {
      "token" : "haier",
      "start_offset" : 0,
      "end_offset" : 5,
      "type" : "ENGLISH",
      "position" : 0
    },
    {
      "token" : "海爾",
      "start_offset" : 6,
      "end_offset" : 8,
      "type" : "CN_WORD",
      "position" : 1
    },
    {
      "token" : "bcd-470wdpg",
      "start_offset" : 9,
      "end_offset" : 20,
      "type" : "LETTER",
      "position" : 2
    },
    {
      "token" : "bcd",
      "start_offset" : 9,
      "end_offset" : 12,
      "type" : "ENGLISH",
      "position" : 3
    },
    {
      "token" : "470",
      "start_offset" : 13,
      "end_offset" : 16,
      "type" : "ARABIC",
      "position" : 4
    },
    {
      "token" : "wdpg",
      "start_offset" : 16,
      "end_offset" : 20,
      "type" : "ENGLISH",
      "position" : 5
    },
    {
      "token" : "十字",
      "start_offset" : 20,
      "end_offset" : 22,
      "type" : "CN_WORD",
      "position" : 6
    },
    {
      "token" : "十",
      "start_offset" : 20,
      "end_offset" : 21,
      "type" : "TYPE_CNUM",
      "position" : 7
    },
    {
      "token" : "字",
      "start_offset" : 21,
      "end_offset" : 22,
      "type" : "COUNT",
      "position" : 8
    },
    {
      "token" : "對開",
      "start_offset" : 22,
      "end_offset" : 24,
      "type" : "CN_WORD",
      "position" : 9
    },
    {
      "token" : "開門",
      "start_offset" : 23,
      "end_offset" : 25,
      "type" : "CN_WORD",
      "position" : 10
    },
    {
      "token" : "門風",
      "start_offset" : 24,
      "end_offset" : 26,
      "type" : "CN_WORD",
      "position" : 11
    },
    {
      "token" : "風冷",
      "start_offset" : 25,
      "end_offset" : 27,
      "type" : "CN_WORD",
      "position" : 12
    },
    {
      "token" : "變頻",
      "start_offset" : 27,
      "end_offset" : 29,
      "type" : "CN_WORD",
      "position" : 13
    },
    {
      "token" : "一級",
      "start_offset" : 29,
      "end_offset" : 31,
      "type" : "CN_WORD",
      "position" : 14
    },
    {
      "token" : "一",
      "start_offset" : 29,
      "end_offset" : 30,
      "type" : "TYPE_CNUM",
      "position" : 15
    },
    {
      "token" : "級",
      "start_offset" : 30,
      "end_offset" : 31,
      "type" : "COUNT",
      "position" : 16
    },
    {
      "token" : "節能",
      "start_offset" : 31,
      "end_offset" : 33,
      "type" : "CN_WORD",
      "position" : 17
    },
    {
      "token" : "家用",
      "start_offset" : 33,
      "end_offset" : 35,
      "type" : "CN_WORD",
      "position" : 18
    },
    {
      "token" : "官方",
      "start_offset" : 35,
      "end_offset" : 37,
      "type" : "CN_WORD",
      "position" : 19
    },
    {
      "token" : "冰箱",
      "start_offset" : 37,
      "end_offset" : 39,
      "type" : "CN_WORD",
      "position" : 20
    }
  ]
}      
GET _analyze?pretty
  {
    "analyzer": "ik_smart",
    "text": "Haier/海爾 BCD-470WDPG十字對開門風冷變頻一級節能家用官方冰箱"
  }      
{
  "tokens" : [
    {
      "token" : "haier",
      "start_offset" : 0,
      "end_offset" : 5,
      "type" : "ENGLISH",
      "position" : 0
    },
    {
      "token" : "海爾",
      "start_offset" : 6,
      "end_offset" : 8,
      "type" : "CN_WORD",
      "position" : 1
    },
    {
      "token" : "bcd-470wdpg",
      "start_offset" : 9,
      "end_offset" : 20,
      "type" : "LETTER",
      "position" : 2
    },
    {
      "token" : "十字",
      "start_offset" : 20,
      "end_offset" : 22,
      "type" : "CN_WORD",
      "position" : 3
    },
    {
      "token" : "對開",
      "start_offset" : 22,
      "end_offset" : 24,
      "type" : "CN_WORD",
      "position" : 4
    },
    {
      "token" : "門",
      "start_offset" : 24,
      "end_offset" : 25,
      "type" : "CN_CHAR",
      "position" : 5
    },
    {
      "token" : "風冷",
      "start_offset" : 25,
      "end_offset" : 27,
      "type" : "CN_WORD",
      "position" : 6
    },
    {
      "token" : "變頻",
      "start_offset" : 27,
      "end_offset" : 29,
      "type" : "CN_WORD",
      "position" : 7
    },
    {
      "token" : "一級",
      "start_offset" : 29,
      "end_offset" : 31,
      "type" : "CN_WORD",
      "position" : 8
    },
    {
      "token" : "節能",
      "start_offset" : 31,
      "end_offset" : 33,
      "type" : "CN_WORD",
      "position" : 9
    },
    {
      "token" : "家用",
      "start_offset" : 33,
      "end_offset" : 35,
      "type" : "CN_WORD",
      "position" : 10
    },
    {
      "token" : "官方",
      "start_offset" : 35,
      "end_offset" : 37,
      "type" : "CN_WORD",
      "position" : 11
    },
    {
      "token" : "冰箱",
      "start_offset" : 37,
      "end_offset" : 39,
      "type" : "CN_WORD",
      "position" : 12
    }
  ]
}      

ik_max_word:會將文本做最細粒度的拆分,比如會將“中華人民共和國國歌”拆分為“中華人民共和國,中華人民,中華,華人,人民共和國,人民,人,民,共和國,共和,和,國國,國歌”,會窮盡各種可能的組合。

ik_smart:會做最粗粒度的拆分,比如會將“中華人民共和國國歌”拆分為“中華人民共和國,國歌”。

分詞使用

使用分詞後,會將資料以反向索引的方式儲存,進而實現模糊查詢。

建立索引並使用 ik 分詞儲存

PUT my_index
{
  "mappings": {
      "properties": {
        "title": {
          "type": "text",
          "analyzer": "ik_max_word" //使用ik分詞儲存
        },
        "name": {
          "type": "text"
        },
        "age": {
          "type": "integer"
        },
        "created": {
          "type": "date",
          "format": "strict_date_optional_time||epoch_millis"
        }
      }
    }
}      

索引插入文檔

POST /my_index/_bulk
{ "index": { "_id": 1 }}
{ "title" : "Haier/海爾 BCD-470WDPG十字對開門風冷變頻一級節能家用官方冰箱", "name" : "王二" , "age": 10, "created": 20190101 }
{ "index": { "_id": 2 }}
{ "title" : "【爆款秒殺】海爾冰箱三門家用小型節能省電雙門電冰箱官方旗艦店", "name" : "王二" , "age": 10, "created": 20190101 }
{ "index": { "_id": 3}}
{ "title" : "Panasonic/松下 NR-TC28WS1-N 風冷無霜家用抑菌三門小體積冰箱", "name" : "王二" , "age": 10, "created": 20190101 }
{ "index": { "_id": 4}}
{ "title" : "小米電視4A50英寸4K高清智能網絡平闆液晶屏家電視機家電官方旗艦", "name" : "王二" , "age": 10, "created": 20190101 }
{ "index": { "_id": 5}}
{ "title" : "創維40X6 40英寸高清電視機智能網絡wifi平闆液晶屏家用彩電32 43", "name" : "王二" , "age": 10, "created": 20190101 }
{ "index": { "_id": 6}}
{ "title" : "Changhong/長虹 50D4P 50英寸超薄無邊全面屏4K超高清智能電視機", "name" : "王二" , "age": 10, "created": 20190101 }      

檢視分詞

GET _analyze?pretty
  {
    "analyzer": "ik_max_word",
    "text": "Haier/海爾 BCD-470WDPG十字對開門風冷變頻一級節能家用官方冰箱"
  }      

通過條件搜尋

GET /my_index/_search?pretty
{
    "query": {
         "match": {"title": "對"}
     }
}      

會發現只有與分詞結果相符的條件才能被查詢到。例如上面 ik_max_word 的分詞結果中有「對開」這個詞,卻沒有單獨的「對」,所以搜尋「對」查不到這筆資料,搜尋「對開」才會命中。

自定義分詞器

參考:

https://blog.csdn.net/Barbarousgrowth_yp/article/details/80242811

https://blog.csdn.net/zhou870498/article/details/80501972

繼續閱讀