Skip to content
This repository was archived by the owner on Jan 10, 2025. It is now read-only.

Commit 31d3d84

Browse files
authored
Updates to ML Query Optimization (#352)
Updates to ML Query Optimization to add doc2query approach
1 parent 5004985 commit 31d3d84

14 files changed: +6613 −40 lines changed

Machine Learning/Query Optimization/README.md

Lines changed: 135 additions & 6 deletions
Large diffs are not rendered by default.

Machine Learning/Query Optimization/bin/optimize-query

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -62,8 +62,8 @@ def main():
6262
print(f" - selected method: {config.selected_method}")
6363
print(f" - default params: {json.dumps(config.default)}")
6464

65-
def logger(iteration, score, params):
66-
print(f" - iteration {iteration} scored {score:.04f} with: {json.dumps(params)}")
65+
def logger(iteration, total_iterations, score, _, duration, params):
66+
print(f" - iteration {iteration}/{total_iterations} ({duration:.04f}s) scored {score:.04f} with: {json.dumps(params)}")
6767

6868
with Timer() as t:
6969
best_score, best_params, final_params, _ = optimize_query(
Lines changed: 144 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,144 @@
1+
{
2+
"settings": {
3+
"index": {
4+
"number_of_shards": 1,
5+
"number_of_replicas": 0,
6+
"similarity": {
7+
"bm25-url": { "type": "BM25" },
8+
"bm25-title": { "type": "BM25" },
9+
"bm25-title-bigrams": { "type": "BM25" },
10+
"bm25-body": { "type": "BM25" },
11+
"bm25-body-bigrams": { "type": "BM25" },
12+
"bm25-expansions": { "type": "BM25" },
13+
"bm25-expansions-bigrams": { "type": "BM25" }
14+
}
15+
},
16+
"analysis": {
17+
"tokenizer": {
18+
"non_word_pattern_tokenizer": {
19+
"type": "pattern",
20+
"pattern": "[\\W_]+",
21+
"flags": "CASE_INSENSITIVE"
22+
}
23+
},
24+
"filter": {
25+
"english_stop": {
26+
"type": "stop",
27+
"stopwords": "_english_"
28+
},
29+
"english_stop_questions": {
30+
"type": "stop",
31+
"stopwords": [
32+
"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with",
33+
"who", "what", "when", "where", "why", "how"
34+
]
35+
},
36+
"english_stop_url": {
37+
"type": "stop",
38+
"stopwords": [
39+
"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with",
40+
"who", "what", "when", "where", "why", "how",
41+
"http", "https", "www",
42+
"gov", "edu",
43+
"com", "net", "org", "info", "biz", "de", "ru", "icu", "uk", "xyz", "top", "cn", "nl", "online", "site", "se", "fr", "it", "eu", "wang", "club",
44+
"html", "htm", "xhtml", "js", "css", "cgi", "dll", "exe", "php", "asp", "aspx", "jsp", "do", "cfm"
45+
]
46+
},
47+
"english_stemmer": {
48+
"type": "stemmer",
49+
"language": "english"
50+
},
51+
"english_possessive_stemmer": {
52+
"type": "stemmer",
53+
"language": "possessive_english"
54+
},
55+
"bigrammer": {
56+
"type": "shingle",
57+
"max_shingle_size": 2,
58+
"min_shingle_size": 2,
59+
"output_unigrams": "false"
60+
}
61+
},
62+
"analyzer": {
63+
"url": {
64+
"tokenizer": "non_word_pattern_tokenizer",
65+
"filter": [
66+
"english_possessive_stemmer",
67+
"lowercase",
68+
"english_stop_url",
69+
"english_stemmer"
70+
]
71+
},
72+
"english_questions": {
73+
"tokenizer": "standard",
74+
"filter": [
75+
"english_possessive_stemmer",
76+
"lowercase",
77+
"english_stop_questions",
78+
"english_stemmer"
79+
]
80+
},
81+
"english_bigrams": {
82+
"tokenizer": "standard",
83+
"filter": [
84+
"english_possessive_stemmer",
85+
"lowercase",
86+
"english_stop_questions",
87+
"english_stemmer",
88+
"bigrammer"
89+
]
90+
}
91+
}
92+
}
93+
},
94+
"mappings": {
95+
"dynamic": "strict",
96+
"properties": {
97+
"id": {
98+
"ignore_above": 1024,
99+
"type": "keyword"
100+
},
101+
"url": {
102+
"type": "text",
103+
"analyzer": "url",
104+
"similarity": "bm25-url"
105+
},
106+
"title": {
107+
"type": "text",
108+
"analyzer": "english_questions",
109+
"similarity": "bm25-title",
110+
"fields": {
111+
"bigrams": {
112+
"type": "text",
113+
"analyzer": "english_bigrams",
114+
"similarity": "bm25-title-bigrams"
115+
}
116+
}
117+
},
118+
"body": {
119+
"type": "text",
120+
"analyzer": "english_questions",
121+
"similarity": "bm25-body",
122+
"fields": {
123+
"bigrams": {
124+
"type": "text",
125+
"analyzer": "english_bigrams",
126+
"similarity": "bm25-body-bigrams"
127+
}
128+
}
129+
},
130+
"expansions": {
131+
"type": "text",
132+
"analyzer": "english_questions",
133+
"similarity": "bm25-expansions",
134+
"fields": {
135+
"bigrams": {
136+
"type": "text",
137+
"analyzer": "english_bigrams",
138+
"similarity": "bm25-expansions-bigrams"
139+
}
140+
}
141+
}
142+
}
143+
}
144+
}
Lines changed: 49 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,49 @@
1+
[
2+
{
3+
"id": "best_fields",
4+
"template": {
5+
"lang": "mustache",
6+
"source": {
7+
"query": {
8+
"multi_match": {
9+
"type": "best_fields",
10+
"query": "{{query_string}}",
11+
"tie_breaker": "{{tie_breaker}}",
12+
"fields": [
13+
"url^{{url|boost}}",
14+
"title^{{title|boost}}",
15+
"title.bigrams^{{title_bigrams|boost}}",
16+
"body^{{body|boost}}",
17+
"body.bigrams^{{body_bigrams|boost}}",
18+
"expansions^{{expansions|boost}}",
19+
"expansions.bigrams^{{expansions_bigrams|boost}}"
20+
]
21+
}
22+
}
23+
}
24+
}
25+
},
26+
{
27+
"id": "most_fields",
28+
"template": {
29+
"lang": "mustache",
30+
"source": {
31+
"query": {
32+
"multi_match": {
33+
"type": "most_fields",
34+
"query": "{{query_string}}",
35+
"fields": [
36+
"url^{{url|boost}}",
37+
"title^{{title|boost}}",
38+
"title.bigrams^{{title_bigrams|boost}}",
39+
"body^{{body|boost}}",
40+
"body.bigrams^{{body_bigrams|boost}}",
41+
"expansions^{{expansions|boost}}",
42+
"expansions.bigrams^{{expansions_bigrams|boost}}"
43+
]
44+
}
45+
}
46+
}
47+
}
48+
}
49+
]

Machine Learning/Query Optimization/config/msmarco-document-templates.json

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,25 @@
4141
}
4242
}
4343
},
44+
{
45+
"id": "most_fields",
46+
"template": {
47+
"lang": "mustache",
48+
"source": {
49+
"query": {
50+
"multi_match": {
51+
"type": "most_fields",
52+
"query": "{{query_string}}",
53+
"fields": [
54+
"url^{{url|boost}}",
55+
"title^{{title|boost}}",
56+
"body^{{body|boost}}"
57+
]
58+
}
59+
}
60+
}
61+
}
62+
},
4463
{
4564
"id": "combined_matches",
4665
"template": {

0 commit comments

Comments
 (0)