A set of analyzers aimed at analyzing specific language text. The
following types are supported:
arabic,
armenian,
basque,
bengali,
brazilian,
bulgarian,
catalan,
cjk,
czech,
danish,
dutch,
english,
estonian,
finnish,
french,
galician,
german,
greek,
hindi,
hungarian,
indonesian,
irish,
italian,
latvian,
lithuanian,
norwegian,
persian,
portuguese,
romanian,
russian,
sorani,
spanish,
swedish,
turkish,
thai.
All analyzers support setting custom stopwords either internally in
the config, or by using an external stopwords file by setting
stopwords_path. Check Stop Analyzer for
more details.
The stem_exclusion parameter allows you to specify an array
of lowercase words that should not be stemmed. Internally, this
functionality is implemented by adding the
keyword_marker token filter
with the keywords set to the value of the stem_exclusion parameter.
The following analyzers support setting custom stem_exclusion list:
arabic, armenian, basque, bengali, bulgarian, catalan, czech,
dutch, english, finnish, french, galician,
german, hindi, hungarian, indonesian, irish, italian, latvian,
lithuanian, norwegian, portuguese, romanian, russian, sorani,
spanish, swedish, turkish.
The built-in language analyzers can be reimplemented as custom analyzers
(as described below) in order to customize their behaviour.
If you do not intend to exclude words from being stemmed (the
equivalent of the stem_exclusion parameter above), then you should remove
the keyword_marker token filter from the custom analyzer configuration.
The arabic analyzer could be reimplemented as a custom analyzer as follows:
PUT /arabic_example
{
"settings": {
"analysis": {
"filter": {
"arabic_stop": {
"type": "stop",
"stopwords": "_arabic_"
},
"arabic_keywords": {
"type": "keyword_marker",
"keywords": ["مثال"]
},
"arabic_stemmer": {
"type": "stemmer",
"language": "arabic"
}
},
"analyzer": {
"rebuilt_arabic": {
"tokenizer": "standard",
"filter": [
"lowercase",
"decimal_digit",
"arabic_stop",
"arabic_normalization",
"arabic_keywords",
"arabic_stemmer"
]
}
}
}
}
}
The armenian analyzer could be reimplemented as a custom analyzer as follows:
PUT /armenian_example
{
"settings": {
"analysis": {
"filter": {
"armenian_stop": {
"type": "stop",
"stopwords": "_armenian_"
},
"armenian_keywords": {
"type": "keyword_marker",
"keywords": ["օրինակ"]
},
"armenian_stemmer": {
"type": "stemmer",
"language": "armenian"
}
},
"analyzer": {
"rebuilt_armenian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"armenian_stop",
"armenian_keywords",
"armenian_stemmer"
]
}
}
}
}
}
The basque analyzer could be reimplemented as a custom analyzer as follows:
PUT /basque_example
{
"settings": {
"analysis": {
"filter": {
"basque_stop": {
"type": "stop",
"stopwords": "_basque_"
},
"basque_keywords": {
"type": "keyword_marker",
"keywords": ["Adibidez"]
},
"basque_stemmer": {
"type": "stemmer",
"language": "basque"
}
},
"analyzer": {
"rebuilt_basque": {
"tokenizer": "standard",
"filter": [
"lowercase",
"basque_stop",
"basque_keywords",
"basque_stemmer"
]
}
}
}
}
}
The bengali analyzer could be reimplemented as a custom analyzer as follows:
PUT /bengali_example
{
"settings": {
"analysis": {
"filter": {
"bengali_stop": {
"type": "stop",
"stopwords": "_bengali_"
},
"bengali_keywords": {
"type": "keyword_marker",
"keywords": ["উদাহরণ"]
},
"bengali_stemmer": {
"type": "stemmer",
"language": "bengali"
}
},
"analyzer": {
"rebuilt_bengali": {
"tokenizer": "standard",
"filter": [
"lowercase",
"decimal_digit",
"bengali_keywords",
"indic_normalization",
"bengali_normalization",
"bengali_stop",
"bengali_stemmer"
]
}
}
}
}
}
The brazilian analyzer could be reimplemented as a custom analyzer as follows:
PUT /brazilian_example
{
"settings": {
"analysis": {
"filter": {
"brazilian_stop": {
"type": "stop",
"stopwords": "_brazilian_"
},
"brazilian_keywords": {
"type": "keyword_marker",
"keywords": ["exemplo"]
},
"brazilian_stemmer": {
"type": "stemmer",
"language": "brazilian"
}
},
"analyzer": {
"rebuilt_brazilian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"brazilian_stop",
"brazilian_keywords",
"brazilian_stemmer"
]
}
}
}
}
}
The bulgarian analyzer could be reimplemented as a custom analyzer as follows:
PUT /bulgarian_example
{
"settings": {
"analysis": {
"filter": {
"bulgarian_stop": {
"type": "stop",
"stopwords": "_bulgarian_"
},
"bulgarian_keywords": {
"type": "keyword_marker",
"keywords": ["пример"]
},
"bulgarian_stemmer": {
"type": "stemmer",
"language": "bulgarian"
}
},
"analyzer": {
"rebuilt_bulgarian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"bulgarian_stop",
"bulgarian_keywords",
"bulgarian_stemmer"
]
}
}
}
}
}
The catalan analyzer could be reimplemented as a custom analyzer as follows:
PUT /catalan_example
{
"settings": {
"analysis": {
"filter": {
"catalan_elision": {
"type": "elision",
"articles": [ "d", "l", "m", "n", "s", "t"],
"articles_case": true
},
"catalan_stop": {
"type": "stop",
"stopwords": "_catalan_"
},
"catalan_keywords": {
"type": "keyword_marker",
"keywords": ["example"]
},
"catalan_stemmer": {
"type": "stemmer",
"language": "catalan"
}
},
"analyzer": {
"rebuilt_catalan": {
"tokenizer": "standard",
"filter": [
"catalan_elision",
"lowercase",
"catalan_stop",
"catalan_keywords",
"catalan_stemmer"
]
}
}
}
}
}
You may find that icu_analyzer in the ICU analysis plugin works better
for CJK text than the cjk analyzer. Experiment with your text and queries.
The cjk analyzer could be reimplemented as a custom analyzer as follows:
PUT /cjk_example
{
"settings": {
"analysis": {
"filter": {
"english_stop": {
"type": "stop",
"stopwords": [
"a", "and", "are", "as", "at", "be", "but", "by", "for",
"if", "in", "into", "is", "it", "no", "not", "of", "on",
"or", "s", "such", "t", "that", "the", "their", "then",
"there", "these", "they", "this", "to", "was", "will",
"with", "www"
]
}
},
"analyzer": {
"rebuilt_cjk": {
"tokenizer": "standard",
"filter": [
"cjk_width",
"lowercase",
"cjk_bigram",
"english_stop"
]
}
}
}
}
}
The czech analyzer could be reimplemented as a custom analyzer as follows:
PUT /czech_example
{
"settings": {
"analysis": {
"filter": {
"czech_stop": {
"type": "stop",
"stopwords": "_czech_"
},
"czech_keywords": {
"type": "keyword_marker",
"keywords": ["příklad"]
},
"czech_stemmer": {
"type": "stemmer",
"language": "czech"
}
},
"analyzer": {
"rebuilt_czech": {
"tokenizer": "standard",
"filter": [
"lowercase",
"czech_stop",
"czech_keywords",
"czech_stemmer"
]
}
}
}
}
}
The danish analyzer could be reimplemented as a custom analyzer as follows:
PUT /danish_example
{
"settings": {
"analysis": {
"filter": {
"danish_stop": {
"type": "stop",
"stopwords": "_danish_"
},
"danish_keywords": {
"type": "keyword_marker",
"keywords": ["eksempel"]
},
"danish_stemmer": {
"type": "stemmer",
"language": "danish"
}
},
"analyzer": {
"rebuilt_danish": {
"tokenizer": "standard",
"filter": [
"lowercase",
"danish_stop",
"danish_keywords",
"danish_stemmer"
]
}
}
}
}
}
The dutch analyzer could be reimplemented as a custom analyzer as follows:
PUT /dutch_example
{
"settings": {
"analysis": {
"filter": {
"dutch_stop": {
"type": "stop",
"stopwords": "_dutch_"
},
"dutch_keywords": {
"type": "keyword_marker",
"keywords": ["voorbeeld"]
},
"dutch_stemmer": {
"type": "stemmer",
"language": "dutch"
},
"dutch_override": {
"type": "stemmer_override",
"rules": [
"fiets=>fiets",
"bromfiets=>bromfiets",
"ei=>eier",
"kind=>kinder"
]
}
},
"analyzer": {
"rebuilt_dutch": {
"tokenizer": "standard",
"filter": [
"lowercase",
"dutch_stop",
"dutch_keywords",
"dutch_override",
"dutch_stemmer"
]
}
}
}
}
}
The english analyzer could be reimplemented as a custom analyzer as follows:
PUT /english_example
{
"settings": {
"analysis": {
"filter": {
"english_stop": {
"type": "stop",
"stopwords": "_english_"
},
"english_keywords": {
"type": "keyword_marker",
"keywords": ["example"]
},
"english_stemmer": {
"type": "stemmer",
"language": "english"
},
"english_possessive_stemmer": {
"type": "stemmer",
"language": "possessive_english"
}
},
"analyzer": {
"rebuilt_english": {
"tokenizer": "standard",
"filter": [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"english_keywords",
"english_stemmer"
]
}
}
}
}
}
The estonian analyzer could be reimplemented as a custom analyzer as follows:
PUT /estonian_example
{
"settings": {
"analysis": {
"filter": {
"estonian_stop": {
"type": "stop",
"stopwords": "_estonian_"
},
"estonian_keywords": {
"type": "keyword_marker",
"keywords": ["näide"]
},
"estonian_stemmer": {
"type": "stemmer",
"language": "estonian"
}
},
"analyzer": {
"rebuilt_estonian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"estonian_stop",
"estonian_keywords",
"estonian_stemmer"
]
}
}
}
}
}
The finnish analyzer could be reimplemented as a custom analyzer as follows:
PUT /finnish_example
{
"settings": {
"analysis": {
"filter": {
"finnish_stop": {
"type": "stop",
"stopwords": "_finnish_"
},
"finnish_keywords": {
"type": "keyword_marker",
"keywords": ["esimerkki"]
},
"finnish_stemmer": {
"type": "stemmer",
"language": "finnish"
}
},
"analyzer": {
"rebuilt_finnish": {
"tokenizer": "standard",
"filter": [
"lowercase",
"finnish_stop",
"finnish_keywords",
"finnish_stemmer"
]
}
}
}
}
}
The french analyzer could be reimplemented as a custom analyzer as follows:
PUT /french_example
{
"settings": {
"analysis": {
"filter": {
"french_elision": {
"type": "elision",
"articles_case": true,
"articles": [
"l", "m", "t", "qu", "n", "s",
"j", "d", "c", "jusqu", "quoiqu",
"lorsqu", "puisqu"
]
},
"french_stop": {
"type": "stop",
"stopwords": "_french_"
},
"french_keywords": {
"type": "keyword_marker",
"keywords": ["Example"]
},
"french_stemmer": {
"type": "stemmer",
"language": "light_french"
}
},
"analyzer": {
"rebuilt_french": {
"tokenizer": "standard",
"filter": [
"french_elision",
"lowercase",
"french_stop",
"french_keywords",
"french_stemmer"
]
}
}
}
}
}
The galician analyzer could be reimplemented as a custom analyzer as follows:
PUT /galician_example
{
"settings": {
"analysis": {
"filter": {
"galician_stop": {
"type": "stop",
"stopwords": "_galician_"
},
"galician_keywords": {
"type": "keyword_marker",
"keywords": ["exemplo"]
},
"galician_stemmer": {
"type": "stemmer",
"language": "galician"
}
},
"analyzer": {
"rebuilt_galician": {
"tokenizer": "standard",
"filter": [
"lowercase",
"galician_stop",
"galician_keywords",
"galician_stemmer"
]
}
}
}
}
}
The german analyzer could be reimplemented as a custom analyzer as follows:
PUT /german_example
{
"settings": {
"analysis": {
"filter": {
"german_stop": {
"type": "stop",
"stopwords": "_german_"
},
"german_keywords": {
"type": "keyword_marker",
"keywords": ["Beispiel"]
},
"german_stemmer": {
"type": "stemmer",
"language": "light_german"
}
},
"analyzer": {
"rebuilt_german": {
"tokenizer": "standard",
"filter": [
"lowercase",
"german_stop",
"german_keywords",
"german_normalization",
"german_stemmer"
]
}
}
}
}
}
The greek analyzer could be reimplemented as a custom analyzer as follows:
PUT /greek_example
{
"settings": {
"analysis": {
"filter": {
"greek_stop": {
"type": "stop",
"stopwords": "_greek_"
},
"greek_lowercase": {
"type": "lowercase",
"language": "greek"
},
"greek_keywords": {
"type": "keyword_marker",
"keywords": ["παράδειγμα"]
},
"greek_stemmer": {
"type": "stemmer",
"language": "greek"
}
},
"analyzer": {
"rebuilt_greek": {
"tokenizer": "standard",
"filter": [
"greek_lowercase",
"greek_stop",
"greek_keywords",
"greek_stemmer"
]
}
}
}
}
}
The hindi analyzer could be reimplemented as a custom analyzer as follows:
PUT /hindi_example
{
"settings": {
"analysis": {
"filter": {
"hindi_stop": {
"type": "stop",
"stopwords": "_hindi_"
},
"hindi_keywords": {
"type": "keyword_marker",
"keywords": ["उदाहरण"]
},
"hindi_stemmer": {
"type": "stemmer",
"language": "hindi"
}
},
"analyzer": {
"rebuilt_hindi": {
"tokenizer": "standard",
"filter": [
"lowercase",
"decimal_digit",
"hindi_keywords",
"indic_normalization",
"hindi_normalization",
"hindi_stop",
"hindi_stemmer"
]
}
}
}
}
}
The hungarian analyzer could be reimplemented as a custom analyzer as follows:
PUT /hungarian_example
{
"settings": {
"analysis": {
"filter": {
"hungarian_stop": {
"type": "stop",
"stopwords": "_hungarian_"
},
"hungarian_keywords": {
"type": "keyword_marker",
"keywords": ["példa"]
},
"hungarian_stemmer": {
"type": "stemmer",
"language": "hungarian"
}
},
"analyzer": {
"rebuilt_hungarian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"hungarian_stop",
"hungarian_keywords",
"hungarian_stemmer"
]
}
}
}
}
}
The indonesian analyzer could be reimplemented as a custom analyzer as follows:
PUT /indonesian_example
{
"settings": {
"analysis": {
"filter": {
"indonesian_stop": {
"type": "stop",
"stopwords": "_indonesian_"
},
"indonesian_keywords": {
"type": "keyword_marker",
"keywords": ["contoh"]
},
"indonesian_stemmer": {
"type": "stemmer",
"language": "indonesian"
}
},
"analyzer": {
"rebuilt_indonesian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"indonesian_stop",
"indonesian_keywords",
"indonesian_stemmer"
]
}
}
}
}
}
The irish analyzer could be reimplemented as a custom analyzer as follows:
PUT /irish_example
{
"settings": {
"analysis": {
"filter": {
"irish_hyphenation": {
"type": "stop",
"stopwords": [ "h", "n", "t" ],
"ignore_case": true
},
"irish_elision": {
"type": "elision",
"articles": [ "d", "m", "b" ],
"articles_case": true
},
"irish_stop": {
"type": "stop",
"stopwords": "_irish_"
},
"irish_lowercase": {
"type": "lowercase",
"language": "irish"
},
"irish_keywords": {
"type": "keyword_marker",
"keywords": ["sampla"]
},
"irish_stemmer": {
"type": "stemmer",
"language": "irish"
}
},
"analyzer": {
"rebuilt_irish": {
"tokenizer": "standard",
"filter": [
"irish_hyphenation",
"irish_elision",
"irish_lowercase",
"irish_stop",
"irish_keywords",
"irish_stemmer"
]
}
}
}
}
}
The italian analyzer could be reimplemented as a custom analyzer as follows:
PUT /italian_example
{
"settings": {
"analysis": {
"filter": {
"italian_elision": {
"type": "elision",
"articles": [
"c", "l", "all", "dall", "dell",
"nell", "sull", "coll", "pell",
"gl", "agl", "dagl", "degl", "negl",
"sugl", "un", "m", "t", "s", "v", "d"
],
"articles_case": true
},
"italian_stop": {
"type": "stop",
"stopwords": "_italian_"
},
"italian_keywords": {
"type": "keyword_marker",
"keywords": ["esempio"]
},
"italian_stemmer": {
"type": "stemmer",
"language": "light_italian"
}
},
"analyzer": {
"rebuilt_italian": {
"tokenizer": "standard",
"filter": [
"italian_elision",
"lowercase",
"italian_stop",
"italian_keywords",
"italian_stemmer"
]
}
}
}
}
}
The latvian analyzer could be reimplemented as a custom analyzer as follows:
PUT /latvian_example
{
"settings": {
"analysis": {
"filter": {
"latvian_stop": {
"type": "stop",
"stopwords": "_latvian_"
},
"latvian_keywords": {
"type": "keyword_marker",
"keywords": ["piemērs"]
},
"latvian_stemmer": {
"type": "stemmer",
"language": "latvian"
}
},
"analyzer": {
"rebuilt_latvian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"latvian_stop",
"latvian_keywords",
"latvian_stemmer"
]
}
}
}
}
}
The lithuanian analyzer could be reimplemented as a custom analyzer as follows:
PUT /lithuanian_example
{
"settings": {
"analysis": {
"filter": {
"lithuanian_stop": {
"type": "stop",
"stopwords": "_lithuanian_"
},
"lithuanian_keywords": {
"type": "keyword_marker",
"keywords": ["pavyzdys"]
},
"lithuanian_stemmer": {
"type": "stemmer",
"language": "lithuanian"
}
},
"analyzer": {
"rebuilt_lithuanian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"lithuanian_stop",
"lithuanian_keywords",
"lithuanian_stemmer"
]
}
}
}
}
}
The norwegian analyzer could be reimplemented as a custom analyzer as follows:
PUT /norwegian_example
{
"settings": {
"analysis": {
"filter": {
"norwegian_stop": {
"type": "stop",
"stopwords": "_norwegian_"
},
"norwegian_keywords": {
"type": "keyword_marker",
"keywords": ["eksempel"]
},
"norwegian_stemmer": {
"type": "stemmer",
"language": "norwegian"
}
},
"analyzer": {
"rebuilt_norwegian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"norwegian_stop",
"norwegian_keywords",
"norwegian_stemmer"
]
}
}
}
}
}
The persian analyzer could be reimplemented as a custom analyzer as follows:
PUT /persian_example
{
"settings": {
"analysis": {
"char_filter": {
"zero_width_spaces": {
"type": "mapping",
"mappings": [ "\\u200C=>\\u0020"]
}
},
"filter": {
"persian_stop": {
"type": "stop",
"stopwords": "_persian_"
}
},
"analyzer": {
"rebuilt_persian": {
"tokenizer": "standard",
"char_filter": [ "zero_width_spaces" ],
"filter": [
"lowercase",
"decimal_digit",
"arabic_normalization",
"persian_normalization",
"persian_stop"
]
}
}
}
}
}
The portuguese analyzer could be reimplemented as a custom analyzer as follows:
PUT /portuguese_example
{
"settings": {
"analysis": {
"filter": {
"portuguese_stop": {
"type": "stop",
"stopwords": "_portuguese_"
},
"portuguese_keywords": {
"type": "keyword_marker",
"keywords": ["exemplo"]
},
"portuguese_stemmer": {
"type": "stemmer",
"language": "light_portuguese"
}
},
"analyzer": {
"rebuilt_portuguese": {
"tokenizer": "standard",
"filter": [
"lowercase",
"portuguese_stop",
"portuguese_keywords",
"portuguese_stemmer"
]
}
}
}
}
}
The romanian analyzer could be reimplemented as a custom analyzer as follows:
PUT /romanian_example
{
"settings": {
"analysis": {
"filter": {
"romanian_stop": {
"type": "stop",
"stopwords": "_romanian_"
},
"romanian_keywords": {
"type": "keyword_marker",
"keywords": ["exemplu"]
},
"romanian_stemmer": {
"type": "stemmer",
"language": "romanian"
}
},
"analyzer": {
"rebuilt_romanian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"romanian_stop",
"romanian_keywords",
"romanian_stemmer"
]
}
}
}
}
}
The russian analyzer could be reimplemented as a custom analyzer as follows:
PUT /russian_example
{
"settings": {
"analysis": {
"filter": {
"russian_stop": {
"type": "stop",
"stopwords": "_russian_"
},
"russian_keywords": {
"type": "keyword_marker",
"keywords": ["пример"]
},
"russian_stemmer": {
"type": "stemmer",
"language": "russian"
}
},
"analyzer": {
"rebuilt_russian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"russian_stop",
"russian_keywords",
"russian_stemmer"
]
}
}
}
}
}
The sorani analyzer could be reimplemented as a custom analyzer as follows:
PUT /sorani_example
{
"settings": {
"analysis": {
"filter": {
"sorani_stop": {
"type": "stop",
"stopwords": "_sorani_"
},
"sorani_keywords": {
"type": "keyword_marker",
"keywords": ["mînak"]
},
"sorani_stemmer": {
"type": "stemmer",
"language": "sorani"
}
},
"analyzer": {
"rebuilt_sorani": {
"tokenizer": "standard",
"filter": [
"sorani_normalization",
"lowercase",
"decimal_digit",
"sorani_stop",
"sorani_keywords",
"sorani_stemmer"
]
}
}
}
}
}
The spanish analyzer could be reimplemented as a custom analyzer as follows:
PUT /spanish_example
{
"settings": {
"analysis": {
"filter": {
"spanish_stop": {
"type": "stop",
"stopwords": "_spanish_"
},
"spanish_keywords": {
"type": "keyword_marker",
"keywords": ["ejemplo"]
},
"spanish_stemmer": {
"type": "stemmer",
"language": "light_spanish"
}
},
"analyzer": {
"rebuilt_spanish": {
"tokenizer": "standard",
"filter": [
"lowercase",
"spanish_stop",
"spanish_keywords",
"spanish_stemmer"
]
}
}
}
}
}
The swedish analyzer could be reimplemented as a custom analyzer as follows:
PUT /swedish_example
{
"settings": {
"analysis": {
"filter": {
"swedish_stop": {
"type": "stop",
"stopwords": "_swedish_"
},
"swedish_keywords": {
"type": "keyword_marker",
"keywords": ["exempel"]
},
"swedish_stemmer": {
"type": "stemmer",
"language": "swedish"
}
},
"analyzer": {
"rebuilt_swedish": {
"tokenizer": "standard",
"filter": [
"lowercase",
"swedish_stop",
"swedish_keywords",
"swedish_stemmer"
]
}
}
}
}
}
The turkish analyzer could be reimplemented as a custom analyzer as follows:
PUT /turkish_example
{
"settings": {
"analysis": {
"filter": {
"turkish_stop": {
"type": "stop",
"stopwords": "_turkish_"
},
"turkish_lowercase": {
"type": "lowercase",
"language": "turkish"
},
"turkish_keywords": {
"type": "keyword_marker",
"keywords": ["örnek"]
},
"turkish_stemmer": {
"type": "stemmer",
"language": "turkish"
}
},
"analyzer": {
"rebuilt_turkish": {
"tokenizer": "standard",
"filter": [
"apostrophe",
"turkish_lowercase",
"turkish_stop",
"turkish_keywords",
"turkish_stemmer"
]
}
}
}
}
}
The thai analyzer could be reimplemented as a custom analyzer as follows: