laitimes

Stress testing reports on different shard settings in Elasticsearch

Author: INFINI Labs

Summary

The current cluster frequently suffers from indexing timeouts and request rejections. To investigate, we simulated the online cluster environment and index settings and used a stress testing tool to generate random test data. Writes were tested against the current 850-shard index, an index with half as many shards (425), and an index with a much smaller shard count (50). Each index was tested with different concurrency levels and batch sizes while we observed the indexing throughput and recorded the backlog in the write queue. The results are used to analyze how the number of shards and the batch size affect write performance, and to decide on the subsequent optimization plan.

Stress test scenario

Elasticsearch v7.7.1, 57 nodes in total, including 3 dedicated master nodes and 3 coordinating nodes, with a 31 GB JVM heap.

Stress testing process

Single index 850 shards

Index definition

PUT idx-xxxx-xxxxxx
{
    "aliases" : {
      "alias-xxxx-xxxxxx" : { }
    },
    "mappings" : {
      "dynamic" : "strict",
      "_routing" : {
        "required" : true
      },
      "_source" : {
        "excludes" : [
          "isExtract*",
          "batchNo"
        ]
      },
      "properties" : {
        "addxxxx" : {
          "type" : "text",
          "term_vector" : "with_positions_offsets"
        },
        "clxxxx" : {
          "type" : "byte"
        },
        "contxxxx" : {
          "type" : "text",
          "boost" : 4.0,
          "term_vector" : "with_positions_offsets"
        },
        "conxxxx" : {
          "type" : "keyword",
          "doc_values" : false
        },
        "con1xxxx" : {
          "type" : "text",
          "boost" : 16.0,
          "term_vector" : "with_positions_offsets",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "normalizer" : "keyword_normalizer"
            }
          },
          "analyzer" : "name_analyzer",
          "search_analyzer" : "keyword_analyzer"
        },
        "contSxxxx" : {
          "type" : "long",
          "index" : false,
          "doc_values" : false
        },
        "contSxxxxx" : {
          "type" : "keyword",
          "doc_values" : false
        },
        "contTxxxx" : {
          "type" : "short"
        },
        "crtxxxx" : {
          "type" : "date",
          "ignore_malformed" : true,
          "format" : "yyyyMMddHHmmss"
        },
        "duration" : {
          "type" : "long",
          "index" : false,
          "doc_values" : false
        },
        "largeTxxxx" : {
          "type" : "keyword",
          "boost" : 8.0,
          "index" : false,
          "doc_values" : false
        },
        "md5" : {
          "type" : "keyword",
          "index" : false,
          "doc_values" : false
        },
        "orderxxxx" : {
          "type" : "alias",
          "path" : "contName.keyword"
        },
        "ownxxxxxx" : {
          "type" : "keyword",
          "doc_values" : false
        },
        "ownxxxxxxxxxx" : {
          "type" : "keyword",
          "doc_values" : false
        },
        "ownxxxxxxxxxxx" : {
          "type" : "keyword",
          "doc_values" : false
        },
        "ownxxxxxxxxxxx" : {
          "type" : "keyword",
          "doc_values" : false
        },
        "parenxxxxxxxxxx" : {
          "type" : "keyword"
        },
        "pathxx" : {
          "type" : "text",
          "boost" : 8.0,
          "term_vector" : "with_positions_offsets",
          "fields" : {
            "keyword" : {
              "type" : "keyword"
            }
          },
          "analyzer" : "path_analyzer"
        },
        "presexxxxx" : {
          "type" : "keyword",
          "boost" : 8.0,
          "index" : false,
          "doc_values" : false
        },
        "presexxxxx" : {
          "type" : "keyword",
          "boost" : 8.0,
          "index" : false,
          "doc_values" : false
        },
        "presxxxxxx" : {
          "type" : "keyword",
          "boost" : 8.0,
          "index" : false,
          "doc_values" : false
        },
        "prixxxxxx" : {
          "type" : "short",
          "index" : false
        },
        "search_xxxxxx" : {
          "type" : "alias",
          "path" : "contName"
        },
        "servixxxxxx" : {
          "type" : "byte"
        },
        "shotxxxxxx" : {
          "type" : "date",
          "ignore_malformed" : true,
          "format" : "yyyyMMddHHmmss"
        },
        "xxxxxxlThuxxxxxx" : {
          "type" : "keyword",
          "boost" : 8.0,
          "index" : false,
          "doc_values" : false
        },
        "tagxxxxxx" : {
          "type" : "text",
          "term_vector" : "with_positions_offsets"
        },
        "thumxxxxxx" : {
          "type" : "keyword",
          "boost" : 8.0,
          "index" : false,
          "doc_values" : false
        },
        "xxxxxxpdxxxxxx" : {
          "type" : "date",
          "ignore_malformed" : true,
          "format" : "yyyyMMddHHmmss"
        },
        "xxxxxxderAcxxxxxx" : {
          "type" : "keyword",
          "doc_values" : false
        },
        "xxxxxxerAccouxxxxxx" : {
          "type" : "keyword",
          "doc_values" : false
        },
        "xxxxxxerxxxxxxID" : {
          "type" : "keyword",
          "doc_values" : false
        },
        "xxxxxxderNxxxxxx" : {
          "type" : "keyword",
          "doc_values" : false
        }
      }
    },
    "settings" : {
      "index" : {
        "max_ngram_diff" : "50",
        "refresh_interval" : "1s",
        "number_of_shards" : "850",

        "analysis" : {
          "normalizer" : {
            "keyword_normalizer" : {
              "filter" : [
                "lowercase"
              ],
              "type" : "custom"
            }
          },
          "analyzer" : {
            "keyword_analyzer" : {
              "filter" : [
                "lowercase"
              ],
              "type" : "custom",
              "tokenizer" : "keyword"
            },
            "name_analyzer" : {
              "filter" : [
                "lowercase"
              ],
              "type" : "custom",
              "tokenizer" : "name_tokenizer"
            },
            "path_analyzer" : {
              "filter" : [
                "lowercase"
              ],
              "type" : "custom",
              "tokenizer" : "path_tokenizer"
            }
          },
          "tokenizer" : {
            "name_tokenizer" : {
              "type" : "ngram",
              "min_gram" : "1",
              "max_gram" : "5"
            },
            "path_tokenizer" : {
              "pattern" : "/",
              "type" : "pattern"
            }
          }
        },
        "number_of_replicas" : "1"

      }
    }
  }
           

View sample data

POST idx-owncloud-img/_doc/1?routing=1
{
  "ownerxxxxxx" : "002#######0oV",
  "serxxxxxx" : 1,
  "tagxxxxxx" : "",
  "contxxxxxx" : "",
  "xxxxxxAccoxxxxxxe" : "1",
  "presxxxxxx" : "",
  "conxxxxxx" : "jpg",
  "xxxxxxerBxxxxxx" : "6#######573",
  "ownerxxxxxxx" : "13#######62",
  "presxxxxxxL" : "",
  "duxxxxxx" : 0,
  "paxxxxxx" : "00##########################################043",
  "crtxxxxxx" : "20#######45",
  "pxxxxxxtCatxxxxxx" : "001############################043",
  "sxxxxxxThumxxxxxx" : "http://downl#################################################961",
  "uxxxxxxerAxxxxxxt" : "1##############2",
  "uxxxxxxderAccoxxxxxxe" : "1",
  "uxxxxxxderxxxxxxID" : "0#####################V",
  "lxxxxxxhumxxxxxxl" : "http://d###################################D961",
  "thxxxxxxl" : "http://do###############################################################61",
  "axxxxxxss" : "",
  "uxxxxxxm" : "20##############8",
  "cxxxxxx" : 3,
  "coxxxxxx" : 1,
  "prxxxxxx" : 10,
  "coxxxxxx" : "0###################################cm",
  "co2xxxxxx" : 5##############8,
  "shoxxxxxx" : "20##############4",
  "contxxxxxx" : "mm##############g",
  "presxxxxxx" : "",
  "oxxxxxxBmpxxxxxx" : "6#######3",
  "md5" : "7##############1E"
}
           

Review the loadgen configuration

root@loadgen:/opt/loadgen# cat loadgen.yml
statsd:
  enabled: false
  host: 192.168.3.98
  port: 8125
  namespace: loadgen.
variables:
  - name: ip
    type: file
    path: dict/ip.txt
  - name: message
    type: file
    path: dict/nginx.log
#  - name: user
#    type: file
#    path: dict/user.txt
  - name: id
    type: sequence
  - name: uuid
    type: uuid
  - name: now_local
    type: now_local
  - name: now_utc
    type: now_utc
  - name: now_unix
    type: now_unix
  - name: suffix
    type: range
    from: 12
    to: 12
  - name: bool
    type: range
    from: 0
    to: 1
requests:
  - request:
      method: POST
      runtime_variables:
        batch_no: id
      runtime_body_line_variables:
        routing_no: uuid
      basic_auth:
        username: elastic
        password: ####
      url: https://xxx.elasticsearch.xxx.cn:9243/_bulk
      body_repeat_times: 50
      body: "{ \"create\" : { \"_index\" : \"idx-xxxxxx-xxxxxx\",\"_type\":\"_doc\", \"_id\" : \"$[[uuid]]\" , \"routing\" : \"$[[routing_no]]\" } }\n{ \"ownerxxxxxx\" : \"0011WsjCK0oV\", \"servxxxxxx\" : $[[bool]], \"tagxxxxxx\" : \"\", \"contxxxxxx\" : \"\", \"ownexxxxxxunxxxxxx\" : \"$[[bool]]\", \"prxxxxxxentLxxxxxx\" : \"\", \"conxxxxxx\" : \"jpg\", \"uxxxxxxexxxxxxID\" : \"$[[id]]\", \"owxxxxxxccxxxxxxt\" : \"$[[routing_no]]\", \"prxxxxxxtUxxxxxxL\" : \"\", \"durxxxxxxn\" : 0, \"paxxxxxx\" : \"00019700101000000001/0011WsjCK0oV00019700101000000043\", \"crxxxxxx\" : \"$[[id]]\", \"paxxxxxxntxxxxxxogIxxxxxx\" : \"0011WsjCK0oV00019700101000000043\", \"sxxxxxxThumxxxxxx\" : \"http://xxx.xxx.cn:80/storageWeb/servlet/GetFileByURLServlet?root=/mnt/wfs133&fileid=KB1af35f100578d655b2cfbd7edd2cb50e.jpg&ct=1&type=0&code=80B0EAB7F429F1A32F76EB895F5FF4DE1853D254604FAB67A7C33FDF92BE7220&exp=315&account=MTM2MzgzMTU1NjI=&p=0&ui=0011WsjCK0oV&ci=0011WsjCK0oV06320210812125345tcm&userSiteId=usersite-s&cn=mmexport162592513503...&oprChannel=10000000&dom=D961\", \"xxxxxxderAxxxxxxnt\" : \"$[[routing_no]]\", \"upxxxxxxerAcxxxxxxtype\" : \"$[[bool]]\", \"uploaderNDUserID\" : \"$[[uuid]]\", \"largeThumbnail\" : \"http://xxx.xxx.cn:80/storageWeb/servlet/GetFileByURLServlet?root=/mnt/wfs133&fileid=KB1af35f100578d655b2cfbd7edd2cb50e.jpg&ct=1&type=1&code=80B0EAB7F429F1A32F76EB895F5FF4DE1853D254604FAB67A7C33FDF92BE7220&exp=315&account=MTM2MzgzMTU1NjI=&p=0&ui=0011WsjCK0oV&ci=0011WsjCK0oV06320210812125345tcm&userSiteId=usersite-s&cn=mmexport162592513503...&oprChannel=10000000&dom=D961\", \"xxxxxxil\" : \"http://download.xxx.xxx.com:80/storageWeb/servlet/GetFileByURLServlet?root=/mnt/wfs133&fileid=KB1af35f100578d655b2cfbd7edd2cb50e.jpg&ct=1&type=2&code=80B0EAB7F429F1A32F76EB895F5FF4DE1853D254604FAB67A7C33FDF92BE7220&exp=315&account=MTM2MzgzMTU1NjI=&p=0&ui=0011WsjCK0oV&ci=0011WsjCK0oV06320210812125345tcm&userSiteId=usersite-s&cn=mmexport162592513503...&oprChannel=10000000&dom=D961\", 
\"adxxxxxx\" : \"\", \"upxxxxxx\" : \"$[[now_unix]]\", \"cxxxxxx\" : 3, \"contxxxxxxe\" : $[[bool]], \"prixxxxxx\" : 10, \"conxxxxxx\" : \"0011WsjCK0oV06320210812125345tcm\", \"contxxxxxx\" : $[[id]], \"shoxxxxxx\" : \"$[[id]]\", \"contxxxxxxe\" : \"mmexport1625925135032.jpg\", \"prxxxxxxtHxxxxxx\" : \"\", \"oxxxxxxrBmxxxxxxID\" : \"$[[id]]\", \"md5\" : \"$[[uuid]]\" }\n"


           

Run the test

Enable gzip request compression (-compress) and run the stress test with 100 concurrent clients (-c 100) for 6000 seconds (-d 6000):

root@loadgen:/opt/loadgen# ./loadgen-linux-amd64 -config loadgen.yml -d 6000 -c 100  -compress
           

1 replica batch 50 concurrency 100

Stress testing reports on different shard settings in Elasticsearch

0 replicas batch 50 concurrency 100

Stress testing reports on different shard settings in Elasticsearch

0 replicas batch 50 concurrency 200

Stress testing reports on different shard settings in Elasticsearch

The write queue already shows a significant backlog and many rejected requests:

Stress testing reports on different shard settings in Elasticsearch

1 replica batch 50 concurrency 200

Stress testing reports on different shard settings in Elasticsearch
Stress testing reports on different shard settings in Elasticsearch

1 replica batch 50 concurrency 400

Stress testing reports on different shard settings in Elasticsearch

1 replica batch 50 concurrency 800

Stress testing reports on different shard settings in Elasticsearch
Stress testing reports on different shard settings in Elasticsearch

1 replica batch 500 concurrency 100

Stress testing reports on different shard settings in Elasticsearch
Stress testing reports on different shard settings in Elasticsearch

1 replica batch 2000 concurrency 100

Stress testing reports on different shard settings in Elasticsearch
Stress testing reports on different shard settings in Elasticsearch

1 replica batch 5000 concurrency 100

Stress testing reports on different shard settings in Elasticsearch

1 replica batch 5000 concurrency 200

Stress testing reports on different shard settings in Elasticsearch
Stress testing reports on different shard settings in Elasticsearch

Single index 425 shards

Index definition

PUT idx-xxxxxx-xxxxxx-425
{
    "aliases" : {
      "alias-xxxxxx-xxxxxx" : { }
    },
    "mappings" : {
      "dynamic" : "strict",
      "_routing" : {
        "required" : true
      },
      "_source" : {
        "excludes" : [
          "isExtract*",
          "batchNo"
        ]
      },
      "properties" : {
        "addxxxxxx" : {
          "type" : "text",
          "term_vector" : "with_positions_offsets"
        },
        "cxxxxxx" : {
          "type" : "byte"
        },
        "coxxxxxxc" : {
          "type" : "text",
          "boost" : 4.0,
          "term_vector" : "with_positions_offsets"
        },
        "coxxxxxx" : {
          "type" : "keyword",
          "doc_values" : false
        },
        "conxxxxxxe" : {
          "type" : "text",
          "boost" : 16.0,
          "term_vector" : "with_positions_offsets",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "normalizer" : "keyword_normalizer"
            }
          },
          "analyzer" : "name_analyzer",
          "search_analyzer" : "keyword_analyzer"
        },
        "coxxxxxxze" : {
          "type" : "long",
          "index" : false,
          "doc_values" : false
        },
        "conxxxxxxfix" : {
          "type" : "keyword",
          "doc_values" : false
        },
        "coxxxxxxpe" : {
          "type" : "short"
        },
        "cxxxxxxm" : {
          "type" : "date",
          "ignore_malformed" : true,
          "format" : "yyyyMMddHHmmss"
        },
        "duxxxxxxon" : {
          "type" : "long",
          "index" : false,
          "doc_values" : false
        },
        "laxxxxxxbnail" : {
          "type" : "keyword",
          "boost" : 8.0,
          "index" : false,
          "doc_values" : false
        },
        "md5" : {
          "type" : "keyword",
          "index" : false,
          "doc_values" : false
        },
        "ordxxxxxxNamxxxxxx" : {
          "type" : "alias",
          "path" : "contName.keyword"
        },
        "oxxxxxxccoxxxxxxt" : {
          "type" : "keyword",
          "doc_values" : false
        },
        "owxxxxxxcounxxxxxxpe" : {
          "type" : "keyword",
          "doc_values" : false
        },
        "owxxxxxxpUsxxxxxxD" : {
          "type" : "keyword",
          "doc_values" : false
        },
        "oxxxxxxDUsexxxxxxD" : {
          "type" : "keyword",
          "doc_values" : false
        },
        "pxxxxxxtalxxxxxxD" : {
          "type" : "keyword"
        },
        "patxxxxxx" : {
          "type" : "text",
          "boost" : 8.0,
          "term_vector" : "with_positions_offsets",
          "fields" : {
            "keyword" : {
              "type" : "keyword"
            }
          },
          "analyzer" : "path_analyzer"
        },
        "prxxxxxxntHxxxxxx" : {
          "type" : "keyword",
          "boost" : 8.0,
          "index" : false,
          "doc_values" : false
        },
        "prxxxxxxntLxxxxxx" : {
          "type" : "keyword",
          "boost" : 8.0,
          "index" : false,
          "doc_values" : false
        },
        "prxxxxxxURxxxxxx" : {
          "type" : "keyword",
          "boost" : 8.0,
          "index" : false,
          "doc_values" : false
        },
        "pxxxxxxity" : {
          "type" : "short",
          "index" : false
        },
        "sxxxxxxch_nxxxxxxe" : {
          "type" : "alias",
          "path" : "contName"
        },
        "sexxxxxxeTxxxxxxe" : {
          "type" : "byte"
        },
        "sxxxxxxTm" : {
          "type" : "date",
          "ignore_malformed" : true,
          "format" : "yyyyMMddHHmmss"
        },
        "smxxxxxxThuxxxxxxl" : {
          "type" : "keyword",
          "boost" : 8.0,
          "index" : false,
          "doc_values" : false
        },
        "taxxxxxxa" : {
          "type" : "text",
          "term_vector" : "with_positions_offsets"
        },
        "txxxxxxnaxxxxxx" : {
          "type" : "keyword",
          "boost" : 8.0,
          "index" : false,
          "doc_values" : false
        },
        "uxxxxxxm" : {
          "type" : "date",
          "ignore_malformed" : true,
          "format" : "yyyyMMddHHmmss"
        },
        "upxxxxxxdexxxxxxount" : {
          "type" : "keyword",
          "doc_values" : false
        },
        "upxxxxxxrAcxxxxxxpe" : {
          "type" : "keyword",
          "doc_values" : false
        },
        "upxxxxxxmpUsxxxxxx" : {
          "type" : "keyword",
          "doc_values" : false
        },
        "uxxxxxxerNDxxxxxxD" : {
          "type" : "keyword",
          "doc_values" : false
        }
      }
    },
    "settings" : {
      "index" : {
        "max_ngram_diff" : "50",
        "refresh_interval" : "1s",
        "number_of_shards" : "425",

        "analysis" : {
          "normalizer" : {
            "keyword_normalizer" : {
              "filter" : [
                "lowercase"
              ],
              "type" : "custom"
            }
          },
          "analyzer" : {
            "keyword_analyzer" : {
              "filter" : [
                "lowercase"
              ],
              "type" : "custom",
              "tokenizer" : "keyword"
            },
            "name_analyzer" : {
              "filter" : [
                "lowercase"
              ],
              "type" : "custom",
              "tokenizer" : "name_tokenizer"
            },
            "path_analyzer" : {
              "filter" : [
                "lowercase"
              ],
              "type" : "custom",
              "tokenizer" : "path_tokenizer"
            }
          },
          "tokenizer" : {
            "name_tokenizer" : {
              "type" : "ngram",
              "min_gram" : "1",
              "max_gram" : "5"
            },
            "path_tokenizer" : {
              "pattern" : "/",
              "type" : "pattern"
            }
          }
        },
        "number_of_replicas" : "1"

      }
    }
  }
           

1 replica batch 50 concurrency 100

Stress testing reports on different shard settings in Elasticsearch
Stress testing reports on different shard settings in Elasticsearch

1 replica batch 50 concurrency 200

1 replica batch 50 concurrency 400

1 replica batch 50 concurrency 800

1 replica batch 500 concurrency 100

Stress testing reports on different shard settings in Elasticsearch
Stress testing reports on different shard settings in Elasticsearch

1 replica batch 2000 concurrency 100

Stress testing reports on different shard settings in Elasticsearch

1 replica batch 5000 concurrency 100

Stress testing reports on different shard settings in Elasticsearch
Stress testing reports on different shard settings in Elasticsearch

Single index with 50 shards

1 replica batch 50 concurrency 100

Stress testing reports on different shard settings in Elasticsearch
Stress testing reports on different shard settings in Elasticsearch

1 replica batch 500 concurrency 100

Stress testing reports on different shard settings in Elasticsearch
Stress testing reports on different shard settings in Elasticsearch

1 replica batch 1000 concurrency 100

Stress testing reports on different shard settings in Elasticsearch
Stress testing reports on different shard settings in Elasticsearch

1 replica batch 5000 concurrency 100

Stress testing reports on different shard settings in Elasticsearch
Stress testing reports on different shard settings in Elasticsearch

Through the gateway: single index 425 shards

1 replica batch 50 concurrency 400>200

Stress testing reports on different shard settings in Elasticsearch

1 replica batch 500 concurrency 100

Stress testing reports on different shard settings in Elasticsearch

1 replica batch 500 concurrency 200

Stress testing reports on different shard settings in Elasticsearch

1 replica batch 500 concurrency 400

Stress testing reports on different shard settings in Elasticsearch
Stress testing reports on different shard settings in Elasticsearch

1 replica batch 5000 concurrency 100

Stress testing reports on different shard settings in Elasticsearch
Stress testing reports on different shard settings in Elasticsearch
Stress testing reports on different shard settings in Elasticsearch

1 replica batch 5000 concurrency 200

Stress testing reports on different shard settings in Elasticsearch
Stress testing reports on different shard settings in Elasticsearch

1 replica batch 5000 concurrency 400

Stress testing reports on different shard settings in Elasticsearch
Stress testing reports on different shard settings in Elasticsearch

Through the gateway: single index 850 shards

1 replica batch 50 concurrency 400

Stress testing reports on different shard settings in Elasticsearch

1 replica batch 500 concurrency 400

Stress testing reports on different shard settings in Elasticsearch

1 replica batch 5000 concurrency 400

Stress testing reports on different shard settings in Elasticsearch

Stress test results

Number of indexes Number of shards Number of replicas Batch size Stress test concurrency Average write throughput (EPS)
1 850 1 50 100 10,000
1 850 0 50 100 30,000
1 850 0 50 200 40,000
1 850 1 50 200 18,000
1 850 1 50 400 27,500
1 850 1 50 800 29,700
1 850 1 500 100 30,187
1 850 1 2000 100 68,000
1 850 1 5000 100 98,915
1 850 1 5000 200 78,462
1 425 1 50 100 12,695
1 425 1 500 100 46,818
1 425 1 2000 100 100,000
1 425 1 5000 100 130,000
1 50 1 50 100 32,987
1 50 1 500 100 96,207
1 50 1 1000 100 147,719
1 50 1 5000 100 156,961

Through the gateway node, in asynchronous merge mode:

Number of indexes Number of shards Number of replicas Batch size Stress test concurrency Average write throughput (EPS)
1 425 1 50 100 500
1 425 1 50 200 1,000
1 425 1 50 400 2,000
1 425 1 500 100 4,800
1 425 1 500 200 9,350
1 425 1 500 400 17,000
1 425 1 5000 100 50,000
1 425 1 5000 200 100,000
1 425 1 5000 400 175,000
1 850 1 50 400 2,000
1 850 1 500 400 18,800
1 850 1 5000 400 137,000

Conclusion

Indexes with a very large number of shards (850 or 425) can fill the write thread pool and cause request rejections even at a concurrency of only 100, and this is more likely when the number of documents per batch is small. For indexes with the same schema, the 50-shard index achieves about twice the throughput of the 425-shard index and about three times that of the 850-shard index, and its thread pool queue basically does not back up, or any backlog is processed quickly. The more documents a single request carries, the more efficient the writes. In some scenarios, even though documents are written with routing, an index with a very large number of shards still suffers from serious request-forwarding inefficiency. We recommend splitting the oversized index into several sub-indexes along a business dimension or the current routing dimension, with no more than 20 shards per index.