summary
In order to verify that the current cluster often has the problem of index timeout and request rejection, the online cluster environment and index settings are simulated, and the test data is randomly generated through the stress testing tool, and the current 850 sharded indexes, as well as the indexes after halving, and the writes of smaller shard indexes, using different concurrency and different batch sizes to observe the throughput of the indexes, and record the accumulation of the write queue, which is used to analyze the impact of the number of shards and batches on the writes, so as to determine the subsequent optimization scheme.
Stress test scenario
Elasticsearch version v7.7.1, with a total of 57 nodes, including 3 independent masters, 3 coordinators, and 31GB JVM.
Stress testing process
Single index 850 shards
Index definition
PUT idx-xxxx-xxxxxx
{
"aliases" : {
"alias-xxxx-xxxxxx" : { }
},
"mappings" : {
"dynamic" : "strict",
"_routing" : {
"required" : true
},
"_source" : {
"excludes" : [
"isExtract*",
"batchNo"
]
},
"properties" : {
"addxxxx" : {
"type" : "text",
"term_vector" : "with_positions_offsets"
},
"clxxxx" : {
"type" : "byte"
},
"contxxxx" : {
"type" : "text",
"boost" : 4.0,
"term_vector" : "with_positions_offsets"
},
"conxxxx" : {
"type" : "keyword",
"doc_values" : false
},
"con1xxxx" : {
"type" : "text",
"boost" : 16.0,
"term_vector" : "with_positions_offsets",
"fields" : {
"keyword" : {
"type" : "keyword",
"normalizer" : "keyword_normalizer"
}
},
"analyzer" : "name_analyzer",
"search_analyzer" : "keyword_analyzer"
},
"contSxxxx" : {
"type" : "long",
"index" : false,
"doc_values" : false
},
"contSxxxxx" : {
"type" : "keyword",
"doc_values" : false
},
"contTxxxx" : {
"type" : "short"
},
"crtxxxx" : {
"type" : "date",
"ignore_malformed" : true,
"format" : "yyyyMMddHHmmss"
},
"duration" : {
"type" : "long",
"index" : false,
"doc_values" : false
},
"largeTxxxx" : {
"type" : "keyword",
"boost" : 8.0,
"index" : false,
"doc_values" : false
},
"md5" : {
"type" : "keyword",
"index" : false,
"doc_values" : false
},
"orderxxxx" : {
"type" : "alias",
"path" : "contName.keyword"
},
"ownxxxxxx" : {
"type" : "keyword",
"doc_values" : false
},
"ownxxxxxxxxxx" : {
"type" : "keyword",
"doc_values" : false
},
"ownxxxxxxxxxxx" : {
"type" : "keyword",
"doc_values" : false
},
"ownxxxxxxxxxxx" : {
"type" : "keyword",
"doc_values" : false
},
"parenxxxxxxxxxx" : {
"type" : "keyword"
},
"pathxx" : {
"type" : "text",
"boost" : 8.0,
"term_vector" : "with_positions_offsets",
"fields" : {
"keyword" : {
"type" : "keyword"
}
},
"analyzer" : "path_analyzer"
},
"presexxxxx" : {
"type" : "keyword",
"boost" : 8.0,
"index" : false,
"doc_values" : false
},
"presexxxxx" : {
"type" : "keyword",
"boost" : 8.0,
"index" : false,
"doc_values" : false
},
"presxxxxxx" : {
"type" : "keyword",
"boost" : 8.0,
"index" : false,
"doc_values" : false
},
"prixxxxxx" : {
"type" : "short",
"index" : false
},
"search_xxxxxx" : {
"type" : "alias",
"path" : "contName"
},
"servixxxxxx" : {
"type" : "byte"
},
"shotxxxxxx" : {
"type" : "date",
"ignore_malformed" : true,
"format" : "yyyyMMddHHmmss"
},
"xxxxxxlThuxxxxxx" : {
"type" : "keyword",
"boost" : 8.0,
"index" : false,
"doc_values" : false
},
"tagxxxxxx" : {
"type" : "text",
"term_vector" : "with_positions_offsets"
},
"thumxxxxxx" : {
"type" : "keyword",
"boost" : 8.0,
"index" : false,
"doc_values" : false
},
"xxxxxxpdxxxxxx" : {
"type" : "date",
"ignore_malformed" : true,
"format" : "yyyyMMddHHmmss"
},
"xxxxxxderAcxxxxxx" : {
"type" : "keyword",
"doc_values" : false
},
"xxxxxxerAccouxxxxxx" : {
"type" : "keyword",
"doc_values" : false
},
"xxxxxxerxxxxxxID" : {
"type" : "keyword",
"doc_values" : false
},
"xxxxxxderNxxxxxx" : {
"type" : "keyword",
"doc_values" : false
}
}
},
"settings" : {
"index" : {
"max_ngram_diff" : "50",
"refresh_interval" : "1s",
"number_of_shards" : "850",
"analysis" : {
"normalizer" : {
"keyword_normalizer" : {
"filter" : [
"lowercase"
],
"type" : "custom"
}
},
"analyzer" : {
"keyword_analyzer" : {
"filter" : [
"lowercase"
],
"type" : "custom",
"tokenizer" : "keyword"
},
"name_analyzer" : {
"filter" : [
"lowercase"
],
"type" : "custom",
"tokenizer" : "name_tokenizer"
},
"path_analyzer" : {
"filter" : [
"lowercase"
],
"type" : "custom",
"tokenizer" : "path_tokenizer"
}
},
"tokenizer" : {
"name_tokenizer" : {
"type" : "ngram",
"min_gram" : "1",
"max_gram" : "5"
},
"path_tokenizer" : {
"pattern" : "/",
"type" : "pattern"
}
}
},
"number_of_replicas" : "1"
}
}
}
View sample data
POST idx-owncloud-img/_doc/1?routing=1
{
"ownerxxxxxx" : "002#######0oV",
"serxxxxxx" : 1,
"tagxxxxxx" : "",
"contxxxxxx" : "",
"xxxxxxAccoxxxxxxe" : "1",
"presxxxxxx" : "",
"conxxxxxx" : "jpg",
"xxxxxxerBxxxxxx" : "6#######573",
"ownerxxxxxxx" : "13#######62",
"presxxxxxxL" : "",
"duxxxxxx" : 0,
"paxxxxxx" : "00##########################################043",
"crtxxxxxx" : "20#######45",
"pxxxxxxtCatxxxxxx" : "001############################043",
"sxxxxxxThumxxxxxx" : "http://downl#################################################961",
"uxxxxxxerAxxxxxxt" : "1##############2",
"uxxxxxxderAccoxxxxxxe" : "1",
"uxxxxxxderxxxxxxID" : "0#####################V",
"lxxxxxxhumxxxxxxl" : "http://d###################################D961",
"thxxxxxxl" : "http://do###############################################################61",
"axxxxxxss" : "",
"uxxxxxxm" : "20##############8",
"cxxxxxx" : 3,
"coxxxxxx" : 1,
"prxxxxxx" : 10,
"coxxxxxx" : "0###################################cm",
"co2xxxxxx" : 5##############8,
"shoxxxxxx" : "20##############4",
"contxxxxxx" : "mm##############g",
"presxxxxxx" : "",
"oxxxxxxBmpxxxxxx" : "6#######3",
"md5" : "7##############1E"
}
Review the loadgen configuration
root@loadgen:/opt/loadgen# cat loadgen.yml
statsd:
enabled: false
host: 192.168.3.98
port: 8125
namespace: loadgen.
variables:
- name: ip
type: file
path: dict/ip.txt
- name: message
type: file
path: dict/nginx.log
# - name: user
# type: file
# path: dict/user.txt
- name: id
type: sequence
- name: uuid
type: uuid
- name: now_local
type: now_local
- name: now_utc
type: now_utc
- name: now_unix
type: now_unix
- name: suffix
type: range
from: 12
to: 12
- name: bool
type: range
from: 0
to: 1
requests:
- request:
method: POST
runtime_variables:
batch_no: id
runtime_body_line_variables:
routing_no: uuid
basic_auth:
username: elastic
password: ####
url: https://xxx.elasticsearch.xxx.cn:9243/_bulk
body_repeat_times: 50
body: "{ \"create\" : { \"_index\" : \"idx-xxxxxx-xxxxxx\",\"_type\":\"_doc\", \"_id\" : \"$[[uuid]]\" , \"routing\" : \"$[[routing_no]]\" } }\n{ \"ownerxxxxxx\" : \"0011WsjCK0oV\", \"servxxxxxx\" : $[[bool]], \"tagxxxxxx\" : \"\", \"contxxxxxx\" : \"\", \"ownexxxxxxunxxxxxx\" : \"$[[bool]]\", \"prxxxxxxentLxxxxxx\" : \"\", \"conxxxxxx\" : \"jpg\", \"uxxxxxxexxxxxxID\" : \"$[[id]]\", \"owxxxxxxccxxxxxxt\" : \"$[[routing_no]]\", \"prxxxxxxtUxxxxxxL\" : \"\", \"durxxxxxxn\" : 0, \"paxxxxxx\" : \"00019700101000000001/0011WsjCK0oV00019700101000000043\", \"crxxxxxx\" : \"$[[id]]\", \"paxxxxxxntxxxxxxogIxxxxxx\" : \"0011WsjCK0oV00019700101000000043\", \"sxxxxxxThumxxxxxx\" : \"http://xxx.xxx.cn:80/storageWeb/servlet/GetFileByURLServlet?root=/mnt/wfs133&fileid=KB1af35f100578d655b2cfbd7edd2cb50e.jpg&ct=1&type=0&code=80B0EAB7F429F1A32F76EB895F5FF4DE1853D254604FAB67A7C33FDF92BE7220&exp=315&account=MTM2MzgzMTU1NjI=&p=0&ui=0011WsjCK0oV&ci=0011WsjCK0oV06320210812125345tcm&userSiteId=usersite-s&cn=mmexport162592513503...&oprChannel=10000000&dom=D961\", \"xxxxxxderAxxxxxxnt\" : \"$[[routing_no]]\", \"upxxxxxxerAcxxxxxxtype\" : \"$[[bool]]\", \"uploaderNDUserID\" : \"$[[uuid]]\", \"largeThumbnail\" : \"http://xxx.xxx.cn:80/storageWeb/servlet/GetFileByURLServlet?root=/mnt/wfs133&fileid=KB1af35f100578d655b2cfbd7edd2cb50e.jpg&ct=1&type=1&code=80B0EAB7F429F1A32F76EB895F5FF4DE1853D254604FAB67A7C33FDF92BE7220&exp=315&account=MTM2MzgzMTU1NjI=&p=0&ui=0011WsjCK0oV&ci=0011WsjCK0oV06320210812125345tcm&userSiteId=usersite-s&cn=mmexport162592513503...&oprChannel=10000000&dom=D961\", \"xxxxxxil\" : \"http://download.xxx.xxx.com:80/storageWeb/servlet/GetFileByURLServlet?root=/mnt/wfs133&fileid=KB1af35f100578d655b2cfbd7edd2cb50e.jpg&ct=1&type=2&code=80B0EAB7F429F1A32F76EB895F5FF4DE1853D254604FAB67A7C33FDF92BE7220&exp=315&account=MTM2MzgzMTU1NjI=&p=0&ui=0011WsjCK0oV&ci=0011WsjCK0oV06320210812125345tcm&userSiteId=usersite-s&cn=mmexport162592513503...&oprChannel=10000000&dom=D961\", \"adxxxxxx\" : \"\", \"upxxxxxx\" : \"$[[now_unix]]\", \"cxxxxxx\" : 3, \"contxxxxxxe\" : $[[bool]], \"prixxxxxx\" : 10, \"conxxxxxx\" : \"0011WsjCK0oV06320210812125345tcm\", \"contxxxxxx\" : $[[id]], \"shoxxxxxx\" : \"$[[id]]\", \"contxxxxxxe\" : \"mmexport1625925135032.jpg\", \"prxxxxxxtHxxxxxx\" : \"\", \"oxxxxxxrBmxxxxxxID\" : \"$[[id]]\", \"md5\" : \"$[[uuid]]\" }\n"
Run the test
Enable gzip traffic compression and perform stress tests:
root@loadgen:/opt/loadgen# ./loadgen-linux-amd64 -config loadgen.yml -d 6000 -c 100 -compress
1 replica 100 concurrent
0 replicas 100 concurrent
0 replicas 200 concurrent
The write queue already has a lot of accumulation and rejection:
1 replica 200 concurrent
1 replica 400 concurrent
1 replica 800 concurrent
1 replica batch 500 concurrently 100
1 replica batch 2000 concurrently 100
1 replica batch 5000 concurrency 100
1 replica batch 5000 concurrently 200
Single index 425 shards
Index definition
PUT idx-xxxxxx-xxxxxx-425
{
"aliases" : {
"alias-xxxxxx-xxxxxx" : { }
},
"mappings" : {
"dynamic" : "strict",
"_routing" : {
"required" : true
},
"_source" : {
"excludes" : [
"isExtract*",
"batchNo"
]
},
"properties" : {
"addxxxxxx" : {
"type" : "text",
"term_vector" : "with_positions_offsets"
},
"cxxxxxx" : {
"type" : "byte"
},
"coxxxxxxc" : {
"type" : "text",
"boost" : 4.0,
"term_vector" : "with_positions_offsets"
},
"coxxxxxx" : {
"type" : "keyword",
"doc_values" : false
},
"conxxxxxxe" : {
"type" : "text",
"boost" : 16.0,
"term_vector" : "with_positions_offsets",
"fields" : {
"keyword" : {
"type" : "keyword",
"normalizer" : "keyword_normalizer"
}
},
"analyzer" : "name_analyzer",
"search_analyzer" : "keyword_analyzer"
},
"coxxxxxxze" : {
"type" : "long",
"index" : false,
"doc_values" : false
},
"conxxxxxxfix" : {
"type" : "keyword",
"doc_values" : false
},
"coxxxxxxpe" : {
"type" : "short"
},
"cxxxxxxm" : {
"type" : "date",
"ignore_malformed" : true,
"format" : "yyyyMMddHHmmss"
},
"duxxxxxxon" : {
"type" : "long",
"index" : false,
"doc_values" : false
},
"laxxxxxxbnail" : {
"type" : "keyword",
"boost" : 8.0,
"index" : false,
"doc_values" : false
},
"md5" : {
"type" : "keyword",
"index" : false,
"doc_values" : false
},
"ordxxxxxxNamxxxxxx" : {
"type" : "alias",
"path" : "contName.keyword"
},
"oxxxxxxccoxxxxxxt" : {
"type" : "keyword",
"doc_values" : false
},
"owxxxxxxcounxxxxxxpe" : {
"type" : "keyword",
"doc_values" : false
},
"owxxxxxxpUsxxxxxxD" : {
"type" : "keyword",
"doc_values" : false
},
"oxxxxxxDUsexxxxxxD" : {
"type" : "keyword",
"doc_values" : false
},
"pxxxxxxtalxxxxxxD" : {
"type" : "keyword"
},
"patxxxxxx" : {
"type" : "text",
"boost" : 8.0,
"term_vector" : "with_positions_offsets",
"fields" : {
"keyword" : {
"type" : "keyword"
}
},
"analyzer" : "path_analyzer"
},
"prxxxxxxntHxxxxxx" : {
"type" : "keyword",
"boost" : 8.0,
"index" : false,
"doc_values" : false
},
"prxxxxxxntLxxxxxx" : {
"type" : "keyword",
"boost" : 8.0,
"index" : false,
"doc_values" : false
},
"prxxxxxxURxxxxxx" : {
"type" : "keyword",
"boost" : 8.0,
"index" : false,
"doc_values" : false
},
"pxxxxxxity" : {
"type" : "short",
"index" : false
},
"sxxxxxxch_nxxxxxxe" : {
"type" : "alias",
"path" : "contName"
},
"sexxxxxxeTxxxxxxe" : {
"type" : "byte"
},
"sxxxxxxTm" : {
"type" : "date",
"ignore_malformed" : true,
"format" : "yyyyMMddHHmmss"
},
"smxxxxxxThuxxxxxxl" : {
"type" : "keyword",
"boost" : 8.0,
"index" : false,
"doc_values" : false
},
"taxxxxxxa" : {
"type" : "text",
"term_vector" : "with_positions_offsets"
},
"txxxxxxnaxxxxxx" : {
"type" : "keyword",
"boost" : 8.0,
"index" : false,
"doc_values" : false
},
"uxxxxxxm" : {
"type" : "date",
"ignore_malformed" : true,
"format" : "yyyyMMddHHmmss"
},
"upxxxxxxdexxxxxxount" : {
"type" : "keyword",
"doc_values" : false
},
"upxxxxxxrAcxxxxxxpe" : {
"type" : "keyword",
"doc_values" : false
},
"upxxxxxxmpUsxxxxxx" : {
"type" : "keyword",
"doc_values" : false
},
"uxxxxxxerNDxxxxxxD" : {
"type" : "keyword",
"doc_values" : false
}
}
},
"settings" : {
"index" : {
"max_ngram_diff" : "50",
"refresh_interval" : "1s",
"number_of_shards" : "425",
"analysis" : {
"normalizer" : {
"keyword_normalizer" : {
"filter" : [
"lowercase"
],
"type" : "custom"
}
},
"analyzer" : {
"keyword_analyzer" : {
"filter" : [
"lowercase"
],
"type" : "custom",
"tokenizer" : "keyword"
},
"name_analyzer" : {
"filter" : [
"lowercase"
],
"type" : "custom",
"tokenizer" : "name_tokenizer"
},
"path_analyzer" : {
"filter" : [
"lowercase"
],
"type" : "custom",
"tokenizer" : "path_tokenizer"
}
},
"tokenizer" : {
"name_tokenizer" : {
"type" : "ngram",
"min_gram" : "1",
"max_gram" : "5"
},
"path_tokenizer" : {
"pattern" : "/",
"type" : "pattern"
}
}
},
"number_of_replicas" : "1"
}
}
}
1 replica batch 50 concurrency 100
1 replica batch, 50, concurrent, 200
1 replica batch 50 concurrency 400
1 replica batch 50 concurrently 800
1 replica batch 500 concurrently 100
1 replica batch 2000 concurrently 100
1 replica batch 5000 concurrency 100
Single index with 50 shards
1 replica batch 50 concurrency 100
1 replica batch 500 concurrently 100
1 replica batch 1000 concurrently 100
1 replica batch 5000 concurrency 100
Go gateway single index 425 shards
1 replica batch 50 concurrently 400>200
1 replica batch 500 concurrently 100
1 replica batch 500 concurrently 200
1 replica batch 500 concurrent 400
1 replica batch 5000 concurrency 100
1 replica batch 5000 concurrently 200
1 replica batch 5000 concurrently 400
Go gateway single index 850 shards
1 replica batch 50 concurrency 400
1 replica batch 500 concurrent 400
1 replica batch 5000 concurrently 400
Stress test results
Number of indexes | Number of shards | Number of Aux Copies | Batch size | Stress test concurrency | Average Write Throughput (EPS) |
1 | 850 | 1 | 50 | 100 | 10,000 |
1 | 850 | 50 | 100 | 30,000 | |
1 | 850 | 50 | 200 | 40,000 | |
1 | 850 | 1 | 50 | 200 | 18,000 |
1 | 850 | 1 | 50 | 400 | 27,500 |
1 | 850 | 1 | 50 | 800 | 29,700 |
1 | 850 | 1 | 500 | 100 | 30,187 |
1 | 850 | 1 | 2000 | 100 | 68,000 |
1 | 850 | 1 | 5000 | 100 | 98,915 |
1 | 850 | 1 | 5000 | 200 | 78,462 |
1 | 425 | 1 | 50 | 100 | 12,695 |
1 | 425 | 1 | 500 | 100 | 46818 |
1 | 425 | 1 | 2000 | 100 | 100,000 |
1 | 425 | 1 | 5000 | 100 | 130,000 |
1 | 50 | 1 | 50 | 100 | 32,987 |
1 | 50 | 1 | 500 | 100 | 96,207 |
1 | 50 | 1 | 1000 | 100 | 147,719 |
1 | 50 | 1 | 5000 | 100 | 156,961 |
Follow the gateway node asynchronous merge mode:
Number of indexes | Number of shards | Number of Aux Copies | Batch size | Stress test concurrency | Average Write Throughput (EPS) |
1 | 425 | 1 | 50 | 100 | 500 |
1 | 425 | 1 | 50 | 200 | 1,000 |
1 | 425 | 1 | 50 | 400 | 2,000 |
1 | 425 | 1 | 500 | 100 | 4,800 |
1 | 425 | 1 | 500 | 200 | 9,350 |
1 | 425 | 1 | 500 | 400 | 17,000 |
1 | 425 | 1 | 5000 | 100 | 50,000 |
1 | 425 | 1 | 5000 | 200 | 100,000 |
1 | 425 | 1 | 5000 | 400 | 175,000 |
1 | 850 | 1 | 50 | 400 | 2000 |
1 | 850 | 1 | 500 | 400 | 18,800 |
1 | 850 | 1 | 5000 | 400 | 137,000 |
conclusion
Large shard indexes, 850 or 425, may fill the thread pool and request rejections in the case of concurrency even if there is only 100, and it is more likely to occur when the number of documents in a single batch is relatively small. For indexes of the same format, in the case of 50 shards, the throughput of the index is twice that of 425 shards and three times that of 850 shards, and the thread pool is basically not stacked, or the accumulation is processed quickly. The higher the number of documents in a single request, the more efficient the write. In some scenarios, although the index shards are routed, but the ultra-large shard index has serious forwarding efficiency problems, it is recommended to divide the index according to the business dimension or the current routing dimension, split the super-large index into several sub-indexes, and the number of shards of a single index should not exceed 20.