debeir.engines.elasticsearch.generate_script_score

  1import copy
  2from typing import Dict, Union
  3
  4base_script = {
  5    "lang": "painless",
  6    # Compute faster dot products as all vectors are unit length
  7    "source": None,
  8    "params": None,
  9}
 10
 11
 12class SourceBuilder:
 13    """
 14    Builds Script Score source for NIR-style queries in elasticsearch
 15    Uses the painless language
 16
 17    This is a string builder class
 18    """
 19
 20    def __init__(self):
 21        self.s = ""
 22        self.i = 0
 23        self.variables = []
 24
 25    def _add_line(self, line):
 26        self.s = self.s + line.strip() + "\n"
 27
 28    def add_preamble(self):
 29        """
 30        Adds preamble to the internal string
 31        This will return the bm25 score if the normalization constant is below 0
 32        """
 33        self._add_line(
 34            """
 35            if (params.norm_weight < 0.0) {
 36                return _score;
 37            }
 38        """
 39        )
 40
 41    def add_log_score(self, ignore_below_one=False) -> "SourceBuilder":
 42        """
 43        Adds the BM25 log score line
 44        :param ignore_below_one: Ignore all scores below 1.0 as Log(1) = 0. Otherwise, just ignore Log(0 and under).
 45        :return:
 46            SourceBuilder
 47        """
 48        if ignore_below_one:
 49            self._add_line(
 50                # "def log_score = _score < 1.0 ? 0.0 : Math.log(_score)/Math.log(params.norm_weight);"
 51                "def log_score = params.disable_bm25 ? 0.0 : Math.log(_score)/Math.log(params.norm_weight);"
 52                # "def log_score = Math.log(_score)/Math.log(params.norm_weight);"
 53            )
 54        else:
 55            self._add_line(
 56                "def log_score = _score <= 0.0 ? 0.0 : Math.log(_score)/Math.log(params.norm_weight);"
 57                # "def log_score = Math.log(_score)/Math.log(params.norm_weight);"
 58            )
 59
 60        return self
 61
 62    def add_embed_field(self, qfield, field) -> "SourceBuilder":
 63        """
 64        Adds a cosine score line.
 65        :param qfield: Query field
 66        :param field: Document facet field
 67        :return:
 68        """
 69        if "embedding" not in field.lower():
 70            field = field.replace(".", "_") + "_Embedding"
 71
 72        variable_name = f"{field}_{qfield}_score"
 73
 74        self._add_line(
 75            f"double {variable_name} = doc['{field}'].isEmpty() ? 0.0 : params.weights[{self.i}]*cosineSimilarity(params.{qfield}"
 76            f", '{field}') + params.offset; "
 77            # f"double {variable_name} = cosineSimilarity(params.{qfield}, '{field}') + 1.0; "
 78        )
 79        self.variables.append(variable_name)
 80
 81        self.i += 1
 82
 83        return self
 84
 85    def finish(self):
 86        """
 87        Finalises the script score and returns the internal string
 88        :return:
 89            A string containing the script score query
 90        """
 91        self._add_line("double embed_score = " + " + ".join(self.variables) + ";")
 92        self._add_line(
 93            # "return params.disable_bm25 == true ? embed_score : embed_score + log_score;"
 94            "return embed_score + log_score;"
 95        )
 96
 97        return self.s
 98
 99
def generate_source(qfields: Union[list, str], fields) -> str:
    """
    Generates the script source based off a set of input fields and facets

    One cosine score line is emitted for every (query field, document field)
    pair, query fields in the outer loop, after a leading log score line.

    :param qfields: Query fields (or topic fields); a single name may be
        passed as a plain string
    :param fields: Document facets to compute cosine similarity on; a single
        name may be passed as a plain string
    :return:
        The script source as a string
    """
    if isinstance(qfields, str):
        qfields = [qfields]

    # A bare string would otherwise be iterated character by character, and a
    # one-shot iterable would be exhausted after the first query field.
    fields = [fields] if isinstance(fields, str) else list(fields)

    sb = SourceBuilder()
    sb.add_log_score(ignore_below_one=True)

    for qfield in qfields:
        for field in fields:
            sb.add_embed_field(qfield, field)

    return sb.finish()
121
122
123# def generate_source(fields, log_ignore=False):
124#    s = ""
125#
126#    if log_ignore:
127#
128#    s = """
129#        def log_score = _score < 1.0 ? _score : Math.log(_score)/Math.log(params.norm_weight);
130#        def weights = params.weights;""".strip()+"\n"
131#
132#    variables = []
133#
134#    for i, field in enumerate(fields):
135#        field = field.replace(".", '_') + '_Embedding'
136#        s += f"double {field}_score = doc['{field}'].size() == 0 ? 0 : weights[{i}]*cosineSimilarity(params.q_eb, '{field}') + params.offset;\n"
137#
138#        variables.append(f"{field}_score")
139#
140#    s = s.strip()
141#
142#    s = s + "\n double embed_score = " + " + ".join(variables) + ";"
143#    s = s + " \n return params.disable_bm25 == true ? embed_score : embed_score + Math.log(_score)/Math.log(params.norm_weight);"
144#
145#    return s
146
147
def check_params_is_valid(params, qfields):
    """
    Validate if the parameters for the script score passes a simple sanity check.

    Every query field name, plus ``"weights"`` and ``"offset"``, must be
    present in ``params``.

    :param params: Mapping of script parameters
    :param qfields: Query field name, or an iterable of names
    :raises AssertionError: If a required key is missing from ``params``
    """
    # A single name given as a string must be treated as one key, not as a
    # sequence of characters.
    if isinstance(qfields, str):
        qfields = [qfields]

    for key in list(qfields) + ["weights", "offset"]:
        if key not in params:
            # Raised explicitly (rather than via ``assert``) so the check still
            # runs under ``python -O``; the exception type is unchanged.
            raise AssertionError(
                f"script score params is missing required key: {key!r}"
            )
160
161
def generate_script(
        fields, params, source_generator=generate_source, qfields="q_eb"
) -> Dict:
    """
    Builds the script definition: a copy of ``base_script`` with the generated
    source and the given parameters filled in.

    :param fields: Document fields to search
    :param params: Parameters for the script; stored in the result as-is (not copied)
    :param source_generator:  Function that will generate the script; called
        as ``source_generator(qfields, fields)``
    :param qfields: Query fields to search from (topic facets); a single name
        may be passed as a plain string
    :return:
        A dict with the keys ``lang``, ``source`` and ``params``
    :raises AssertionError: If ``params`` fails the sanity check
    """
    script = copy.deepcopy(base_script)

    # The validator iterates over its second argument, so a single field name
    # has to be wrapped in a list or it would be checked character by character.
    required_qfields = [qfields] if isinstance(qfields, str) else qfields
    check_params_is_valid(params, required_qfields)

    script["lang"] = "painless"
    # The generator still receives qfields exactly as the caller passed it.
    script["source"] = source_generator(qfields, fields)
    script["params"] = params

    return script
class SourceBuilder:
class SourceBuilder:
    """
    Builds Script Score source for NIR-style queries in elasticsearch
    Uses the painless language

    This is a string builder class: every ``add_*`` method appends one
    statement to the internal string and returns the builder so calls can be
    chained; ``finish`` appends the final statements and returns the script.
    """

    def __init__(self):
        self.s = ""  # script text accumulated so far
        self.i = 0  # index into params.weights used by the next embedding term
        self.variables = []  # names of the per-field score variables emitted so far

    def _add_line(self, line):
        """
        Appends a statement to the internal string.

        :param line: Statement text; surrounding whitespace is stripped and a
            newline is appended.
        """
        self.s = self.s + line.strip() + "\n"

    def add_preamble(self) -> "SourceBuilder":
        """
        Adds preamble to the internal string.
        The emitted statement returns the unmodified ``_score`` when
        ``params.norm_weight`` is below 0.

        :return:
            SourceBuilder
        """
        self._add_line(
            "if (params.norm_weight < 0.0) {\n"
            "    return _score;\n"
            "}"
        )

        # Return the builder like the other add_* methods so calls can be chained.
        return self

    def add_log_score(self, ignore_below_one=False) -> "SourceBuilder":
        """
        Adds the line defining ``log_score``: the log of ``_score`` in base
        ``params.norm_weight``.

        :param ignore_below_one: When true, the emitted line sets ``log_score``
            to 0.0 if ``params.disable_bm25`` is set. Despite the parameter
            name, no ``_score < 1.0`` check is emitted. When false, the emitted
            line sets ``log_score`` to 0.0 if ``_score <= 0.0``.
        :return:
            SourceBuilder
        """
        guard = "params.disable_bm25" if ignore_below_one else "_score <= 0.0"
        self._add_line(
            "def log_score = " + guard
            + " ? 0.0 : Math.log(_score)/Math.log(params.norm_weight);"
        )

        return self

    def add_embed_field(self, qfield, field) -> "SourceBuilder":
        """
        Adds a cosine score line.

        The emitted statement evaluates to 0.0 when the document field is empty
        and otherwise to ``params.weights[i] * cosineSimilarity(...)`` plus
        ``params.offset``, where ``i`` counts the calls made so far.

        :param qfield: Query field; read in the script as ``params.<qfield>``
        :param field: Document facet field. Unless the name already contains
            "embedding" (case-insensitive), dots are replaced with underscores
            and ``_Embedding`` is appended.
        :return:
            SourceBuilder
        """
        import re

        if "embedding" not in field.lower():
            field = field.replace(".", "_") + "_Embedding"

        # The field name may contain characters such as "." or "-" that are
        # valid in doc['...'] but not in a Painless identifier, so only the
        # variable name is sanitised.
        variable_name = re.sub(r"[^0-9A-Za-z_]", "_", f"{field}_{qfield}_score")

        self._add_line(
            f"double {variable_name} = doc['{field}'].isEmpty() ? 0.0 : "
            f"params.weights[{self.i}]*cosineSimilarity(params.{qfield}"
            f", '{field}') + params.offset; "
        )
        self.variables.append(variable_name)

        self.i += 1

        return self

    def finish(self):
        """
        Finalises the script score and returns the internal string.

        The final statement reads ``log_score``, so ``add_log_score`` must have
        been called for the script to be valid.

        :return:
            A string containing the script score query
        """
        # With no embedding fields the join is empty; fall back to 0.0 so the
        # emitted statement is still valid ("double embed_score = ;" is not).
        terms = " + ".join(self.variables) if self.variables else "0.0"
        self._add_line("double embed_score = " + terms + ";")
        self._add_line("return embed_score + log_score;")

        return self.s

Builds Script Score source for NIR-style queries in Elasticsearch. Uses the Painless language.

This is a string builder class

SourceBuilder()
21    def __init__(self):
22        self.s = ""
23        self.i = 0
24        self.variables = []
def add_preamble(self):
29    def add_preamble(self):
30        """
31        Adds preamble to the internal string
32        This will return the bm25 score if the normalization constant is below 0
33        """
34        self._add_line(
35            """
36            if (params.norm_weight < 0.0) {
37                return _score;
38            }
39        """
40        )

Adds preamble to the internal string. This will return the BM25 score if the normalization constant is below 0.

def add_log_score( self, ignore_below_one=False) -> debeir.engines.elasticsearch.generate_script_score.SourceBuilder:
42    def add_log_score(self, ignore_below_one=False) -> "SourceBuilder":
43        """
44        Adds the BM25 log score line
45        :param ignore_below_one: Ignore all scores below 1.0 as Log(1) = 0. Otherwise, just ignore Log(0 and under).
46        :return:
47            SourceBuilder
48        """
49        if ignore_below_one:
50            self._add_line(
51                # "def log_score = _score < 1.0 ? 0.0 : Math.log(_score)/Math.log(params.norm_weight);"
52                "def log_score = params.disable_bm25 ? 0.0 : Math.log(_score)/Math.log(params.norm_weight);"
53                # "def log_score = Math.log(_score)/Math.log(params.norm_weight);"
54            )
55        else:
56            self._add_line(
57                "def log_score = _score <= 0.0 ? 0.0 : Math.log(_score)/Math.log(params.norm_weight);"
58                # "def log_score = Math.log(_score)/Math.log(params.norm_weight);"
59            )
60
61        return self

Adds the BM25 log score line

Parameters
  • ignore_below_one: Ignore all scores below 1.0 as Log(1) = 0. Otherwise, just ignore Log(0 and under).
Returns
SourceBuilder
def add_embed_field( self, qfield, field) -> debeir.engines.elasticsearch.generate_script_score.SourceBuilder:
63    def add_embed_field(self, qfield, field) -> "SourceBuilder":
64        """
65        Adds a cosine score line.
66        :param qfield: Query field
67        :param field: Document facet field
68        :return:
69        """
70        if "embedding" not in field.lower():
71            field = field.replace(".", "_") + "_Embedding"
72
73        variable_name = f"{field}_{qfield}_score"
74
75        self._add_line(
76            f"double {variable_name} = doc['{field}'].isEmpty() ? 0.0 : params.weights[{self.i}]*cosineSimilarity(params.{qfield}"
77            f", '{field}') + params.offset; "
78            # f"double {variable_name} = cosineSimilarity(params.{qfield}, '{field}') + 1.0; "
79        )
80        self.variables.append(variable_name)
81
82        self.i += 1
83
84        return self

Adds a cosine score line.

Parameters
  • qfield: Query field
  • field: Document facet field
Returns
SourceBuilder (the builder itself, so calls can be chained)
def finish(self):
86    def finish(self):
87        """
88        Finalises the script score and returns the internal string
89        :return:
90            A string containing the script score query
91        """
92        self._add_line("double embed_score = " + " + ".join(self.variables) + ";")
93        self._add_line(
94            # "return params.disable_bm25 == true ? embed_score : embed_score + log_score;"
95            "return embed_score + log_score;"
96        )
97
98        return self.s

Finalises the script score and returns the internal string

Returns
A string containing the script score query
def generate_source(qfields: Union[list, str], fields) -> str:
def generate_source(qfields: Union[list, str], fields) -> str:
    """
    Generates the script source based off a set of input fields and facets

    One cosine score line is emitted for every (query field, document field)
    pair, query fields in the outer loop, after a leading log score line.

    :param qfields: Query fields (or topic fields); a single name may be
        passed as a plain string
    :param fields: Document facets to compute cosine similarity on; a single
        name may be passed as a plain string
    :return:
        The script source as a string
    """
    if isinstance(qfields, str):
        qfields = [qfields]

    # A bare string would otherwise be iterated character by character, and a
    # one-shot iterable would be exhausted after the first query field.
    fields = [fields] if isinstance(fields, str) else list(fields)

    sb = SourceBuilder()
    sb.add_log_score(ignore_below_one=True)

    for qfield in qfields:
        for field in fields:
            sb.add_embed_field(qfield, field)

    return sb.finish()

Generates the script source based off a set of input fields and facets

Parameters
  • qfields: Query fields (or topic fields)
  • fields: Document facets to compute cosine similarity on
Returns
The generated script source as a string
def check_params_is_valid(params, qfields):
def check_params_is_valid(params, qfields):
    """
    Validate if the parameters for the script score passes a simple sanity check.

    Every query field name, plus ``"weights"`` and ``"offset"``, must be
    present in ``params``.

    :param params: Mapping of script parameters
    :param qfields: Query field name, or an iterable of names
    :raises AssertionError: If a required key is missing from ``params``
    """
    # A single name given as a string must be treated as one key, not as a
    # sequence of characters.
    if isinstance(qfields, str):
        qfields = [qfields]

    for key in list(qfields) + ["weights", "offset"]:
        if key not in params:
            # Raised explicitly (rather than via ``assert``) so the check still
            # runs under ``python -O``; the exception type is unchanged.
            raise AssertionError(
                f"script score params is missing required key: {key!r}"
            )

Validate if the parameters for the script score passes a simple sanity check.

Parameters
  • params:
  • qfields:
def generate_script( fields, params, source_generator=<function generate_source>, qfields='q_eb') -> Dict:
def generate_script(
        fields, params, source_generator=generate_source, qfields="q_eb"
) -> Dict:
    """
    Builds the script definition: a copy of ``base_script`` with the generated
    source and the given parameters filled in.

    :param fields: Document fields to search
    :param params: Parameters for the script; stored in the result as-is (not copied)
    :param source_generator:  Function that will generate the script; called
        as ``source_generator(qfields, fields)``
    :param qfields: Query fields to search from (topic facets); a single name
        may be passed as a plain string
    :return:
        A dict with the keys ``lang``, ``source`` and ``params``
    :raises AssertionError: If ``params`` fails the sanity check
    """
    script = copy.deepcopy(base_script)

    # The validator iterates over its second argument, so a single field name
    # has to be wrapped in a list or it would be checked character by character.
    required_qfields = [qfields] if isinstance(qfields, str) else qfields
    check_params_is_valid(params, required_qfields)

    script["lang"] = "painless"
    # The generator still receives qfields exactly as the caller passed it.
    script["source"] = source_generator(qfields, fields)
    script["params"] = params

    return script

Parameters for creating the script

Parameters
  • fields: Document fields to search
  • params: Parameters for the script
  • source_generator: Function that will generate the script
  • qfields: Query fields to search from (topic facets)
Returns
A dict with the keys lang, source and params